80 files changed, 5387 insertions, 2610 deletions
diff --git a/runsc/BUILD b/runsc/BUILD
index 757f6d44c..3b91b984a 100644
--- a/runsc/BUILD
+++ b/runsc/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_binary", "pkg_deb", "pkg_tar")
+load("//tools:defs.bzl", "go_binary")
 
 package(licenses = ["notice"])
 
@@ -13,16 +13,7 @@ go_binary(
         "//visibility:public",
     ],
     x_defs = {"main.version": "{STABLE_VERSION}"},
-    deps = [
-        "//pkg/log",
-        "//pkg/refs",
-        "//pkg/sentry/platform",
-        "//runsc/boot",
-        "//runsc/cmd",
-        "//runsc/flag",
-        "//runsc/specutils",
-        "@com_github_google_subcommands//:go_default_library",
-    ],
+    deps = ["//runsc/cli"],
 )
 
 # The runsc-race target is a race-compatible BUILD target. This must be built
@@ -49,68 +40,7 @@ go_binary(
         "//visibility:public",
     ],
     x_defs = {"main.version": "{STABLE_VERSION}"},
-    deps = [
-        "//pkg/log",
-        "//pkg/refs",
-        "//pkg/sentry/platform",
-        "//runsc/boot",
-        "//runsc/cmd",
-        "//runsc/flag",
-        "//runsc/specutils",
-        "@com_github_google_subcommands//:go_default_library",
-    ],
-)
-
-pkg_tar(
-    name = "runsc-bin",
-    srcs = [":runsc"],
-    mode = "0755",
-    package_dir = "/usr/bin",
-    strip_prefix = "/runsc/linux_amd64_pure_stripped",
-)
-
-pkg_tar(
-    name = "debian-data",
-    extension = "tar.gz",
-    deps = [
-        ":runsc-bin",
-    ],
-)
-
-genrule(
-    name = "deb-version",
-    # Note that runsc must appear in the srcs parameter and not the tools
-    # parameter, otherwise it will not be stamped. This is reasonable, as tools
-    # may be encoded differently in the build graph (cached more aggressively
-    # because they are assumes to be hermetic).
-    srcs = [":runsc"],
-    outs = ["version.txt"],
-    # Note that the little dance here is necessary because files in the $(SRCS)
-    # attribute are not executable by default, and we can't touch in place.
-    cmd = "cp $(location :runsc) $(@D)/runsc && \
-        chmod a+x $(@D)/runsc && \
-        $(@D)/runsc -version | grep version | sed 's/^[^0-9]*//' > $@ && \
-        rm -f $(@D)/runsc",
-    stamp = 1,
-)
-
-pkg_deb(
-    name = "runsc-debian",
-    architecture = "amd64",
-    data = ":debian-data",
-    # Note that the description_file will be flatten (all newlines removed),
-    # and therefore it is kept to a simple one-line description. The expected
-    # format for debian packages is "short summary\nLonger explanation of
-    # tool." and this is impossible with the flattening.
-    description_file = "debian/description",
-    homepage = "https://gvisor.dev/",
-    maintainer = "The gVisor Authors <gvisor-dev@googlegroups.com>",
-    package = "runsc",
-    postinst = "debian/postinst.sh",
-    version_file = ":version.txt",
-    visibility = [
-        "//visibility:public",
-    ],
+    deps = ["//runsc/cli"],
 )
 
 sh_test(
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index a907c103b..b97dc3c47 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -8,7 +8,6 @@ go_library(
         "compat.go",
         "compat_amd64.go",
         "compat_arm64.go",
-        "config.go",
         "controller.go",
         "debug.go",
         "events.go",
@@ -27,19 +26,25 @@ go_library(
     deps = [
         "//pkg/abi",
         "//pkg/abi/linux",
+        "//pkg/bpf",
+        "//pkg/cleanup",
         "//pkg/context",
         "//pkg/control/server",
         "//pkg/cpuid",
         "//pkg/eventchannel",
+        "//pkg/fd",
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/memutil",
         "//pkg/rand",
         "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/sentry/arch",
         "//pkg/sentry/arch:registers_go_proto",
         "//pkg/sentry/control",
         "//pkg/sentry/devices/memdev",
+        "//pkg/sentry/devices/ttydev",
+        "//pkg/sentry/devices/tundev",
         "//pkg/sentry/fdimport",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/dev",
@@ -53,8 +58,10 @@ go_library(
         "//pkg/sentry/fs/user",
         "//pkg/sentry/fsimpl/devpts",
         "//pkg/sentry/fsimpl/devtmpfs",
+        "//pkg/sentry/fsimpl/fuse",
         "//pkg/sentry/fsimpl/gofer",
         "//pkg/sentry/fsimpl/host",
+        "//pkg/sentry/fsimpl/overlay",
         "//pkg/sentry/fsimpl/proc",
         "//pkg/sentry/fsimpl/sys",
         "//pkg/sentry/fsimpl/tmpfs",
@@ -86,6 +93,7 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/link/fdbased",
         "//pkg/tcpip/link/loopback",
+        "//pkg/tcpip/link/packetsocket",
         "//pkg/tcpip/link/qdisc/fifo",
         "//pkg/tcpip/link/sniffer",
         "//pkg/tcpip/network/arp",
@@ -100,9 +108,11 @@ go_library(
         "//runsc/boot/filter",
         "//runsc/boot/platforms",
         "//runsc/boot/pprof",
+        "//runsc/config",
         "//runsc/specutils",
-        "@com_github_golang_protobuf//proto:go_default_library",
-        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "//runsc/specutils/seccomp",
+        "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
+        "@org_golang_google_protobuf//proto:go_default_library",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
@@ -118,6 +128,7 @@ go_test(
     library = ":boot",
     deps = [
         "//pkg/control/server",
+        "//pkg/fd",
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/p9",
@@ -126,8 +137,9 @@ go_test(
         "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/unet",
+        "//runsc/config",
         "//runsc/fsgofer",
-        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
index 84c67cbc2..7076ae2e2 100644
--- a/runsc/boot/compat.go
+++ b/runsc/boot/compat.go
@@ -19,7 +19,7 @@ import (
 	"os"
 	"syscall"
 
-	"github.com/golang/protobuf/proto"
+	"google.golang.org/protobuf/proto"
 	"gvisor.dev/gvisor/pkg/eventchannel"
 	"gvisor.dev/gvisor/pkg/log"
 	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 8125d5061..4e0f0d57a 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -22,6 +22,7 @@ import (
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/control/server"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -29,10 +30,12 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	"gvisor.dev/gvisor/pkg/sentry/state"
 	"gvisor.dev/gvisor/pkg/sentry/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/boot/pprof"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -101,14 +104,13 @@ const (
 
 // Profiling related commands (see pprof.go for more details).
 const (
-	StartCPUProfile  = "Profile.StartCPUProfile"
-	StopCPUProfile   = "Profile.StopCPUProfile"
-	HeapProfile      = "Profile.HeapProfile"
-	GoroutineProfile = "Profile.GoroutineProfile"
-	BlockProfile     = "Profile.BlockProfile"
-	MutexProfile     = "Profile.MutexProfile"
-	StartTrace       = "Profile.StartTrace"
-	StopTrace        = "Profile.StopTrace"
+	StartCPUProfile = "Profile.StartCPUProfile"
+	StopCPUProfile  = "Profile.StopCPUProfile"
+	HeapProfile     = "Profile.HeapProfile"
+	BlockProfile    = "Profile.BlockProfile"
+	MutexProfile    = "Profile.MutexProfile"
+	StartTrace      = "Profile.StartTrace"
+	StopTrace       = "Profile.StopTrace"
 )
 
 // Logging related commands (see logging.go for more details).
@@ -129,42 +131,52 @@ type controller struct {
 
 	// manager holds the containerManager methods.
 	manager *containerManager
+
+	// pprof holds the profile instance if enabled. It may be nil.
+	pprof *control.Profile
 }
 
 // newController creates a new controller. The caller must call
 // controller.srv.StartServing() to start the controller.
 func newController(fd int, l *Loader) (*controller, error) {
-	srv, err := server.CreateFromFD(fd)
+	ctrl := &controller{}
+	var err error
+	ctrl.srv, err = server.CreateFromFD(fd)
 	if err != nil {
 		return nil, err
 	}
 
-	manager := &containerManager{
+	ctrl.manager = &containerManager{
 		startChan:       make(chan struct{}),
 		startResultChan: make(chan error),
 		l:               l,
 	}
-	srv.Register(manager)
+	ctrl.srv.Register(ctrl.manager)
 
 	if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok {
 		net := &Network{
 			Stack: eps.Stack,
 		}
-		srv.Register(net)
+		ctrl.srv.Register(net)
 	}
 
-	srv.Register(&debug{})
-	srv.Register(&control.Logging{})
-	if l.conf.ProfileEnable {
-		srv.Register(&control.Profile{
-			Kernel: l.k,
-		})
+	ctrl.srv.Register(&debug{})
+	ctrl.srv.Register(&control.Logging{})
+
+	if l.root.conf.ProfileEnable {
+		ctrl.pprof = &control.Profile{Kernel: l.k}
+		ctrl.srv.Register(ctrl.pprof)
 	}
 
-	return &controller{
-		srv:     srv,
-		manager: manager,
-	}, nil
+	return ctrl, nil
+}
+
+func (c *controller) stop() {
+	if c.pprof != nil {
+		// These are no-ops if nothing is being profiled.
+		_ = c.pprof.StopCPUProfile(nil, nil)
+		_ = c.pprof.StopTrace(nil, nil)
+	}
 }
 
 // containerManager manages sandbox containers.
@@ -211,7 +223,7 @@ type StartArgs struct {
 	Spec *specs.Spec
 
 	// Config is the runsc-specific configuration for the sandbox.
-	Conf *Config
+	Conf *config.Config
 
 	// CID is the ID of the container to start.
 	CID string
@@ -247,13 +259,20 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
 	// All validation passed, logs the spec for debugging.
 	specutils.LogSpec(args.Spec)
 
-	err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files)
+	fds, err := fd.NewFromFiles(args.FilePayload.Files)
 	if err != nil {
+		return err
+	}
+	defer func() {
+		for _, fd := range fds {
+			_ = fd.Close()
+		}
+	}()
+	if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, fds); err != nil {
 		log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err)
 		return err
 	}
 	log.Debugf("Container %q started", args.CID)
-
 	return nil
 }
 
@@ -333,7 +352,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	// Pause the kernel while we build a new one.
 	cm.l.k.Pause()
 
-	p, err := createPlatform(cm.l.conf, deviceFile)
+	p, err := createPlatform(cm.l.root.conf, deviceFile)
 	if err != nil {
 		return fmt.Errorf("creating platform: %v", err)
 	}
@@ -349,12 +368,20 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	cm.l.k = k
 
 	// Set up the restore environment.
-	mntr := newContainerMounter(cm.l.spec, cm.l.goferFDs, cm.l.k, cm.l.mountHints)
-	renv, err := mntr.createRestoreEnvironment(cm.l.conf)
-	if err != nil {
-		return fmt.Errorf("creating RestoreEnvironment: %v", err)
+	ctx := k.SupervisorContext()
+	mntr := newContainerMounter(cm.l.root.spec, cm.l.root.goferFDs, cm.l.k, cm.l.mountHints)
+	if kernel.VFS2Enabled {
+		ctx, err = mntr.configureRestore(ctx, cm.l.root.conf)
+		if err != nil {
+			return fmt.Errorf("configuring filesystem restore: %v", err)
+		}
+	} else {
+		renv, err := mntr.createRestoreEnvironment(cm.l.root.conf)
+		if err != nil {
+			return fmt.Errorf("creating RestoreEnvironment: %v", err)
+		}
+		fs.SetRestoreEnvironment(*renv)
 	}
-	fs.SetRestoreEnvironment(*renv)
 
 	// Prepare to load from the state file.
 	if eps, ok := networkStack.(*netstack.Stack); ok {
@@ -368,7 +395,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 		return fmt.Errorf("file cannot be empty")
 	}
 
-	if cm.l.conf.ProfileEnable {
+	if cm.l.root.conf.ProfileEnable {
 		// pprof.Initialize opens /proc/self/maps, so has to be called before
 		// installing seccomp filters.
 		pprof.Initialize()
@@ -381,19 +408,19 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 
 	// Load the state.
 	loadOpts := state.LoadOpts{Source: specFile}
-	if err := loadOpts.Load(k, networkStack, time.NewCalibratedClocks()); err != nil {
+	if err := loadOpts.Load(ctx, k, networkStack, time.NewCalibratedClocks(), &vfs.CompleteRestoreOptions{}); err != nil {
 		return err
 	}
 
 	// Since we have a new kernel we also must make a new watchdog.
 	dogOpts := watchdog.DefaultOpts
-	dogOpts.TaskTimeoutAction = cm.l.conf.WatchdogAction
+	dogOpts.TaskTimeoutAction = cm.l.root.conf.WatchdogAction
 	dog := watchdog.New(k, dogOpts)
 
 	// Change the loader fields to reflect the changes made when restoring.
 	cm.l.k = k
 	cm.l.watchdog = dog
-	cm.l.rootProcArgs = kernel.CreateProcessArgs{}
+	cm.l.root.procArgs = kernel.CreateProcessArgs{}
 	cm.l.restore = true
 
 	// Reinitialize the sandbox ID and processes map. Note that it doesn't
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 60e33425f..a7c4ebb0c 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -27,41 +27,30 @@ import (
 // allowedSyscalls is the set of syscalls executed by the Sentry to the host OS.
 var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_CLOCK_GETTIME: {},
-	syscall.SYS_CLONE: []seccomp.Rule{
-		{
-			seccomp.AllowValue(
-				syscall.CLONE_VM |
-					syscall.CLONE_FS |
-					syscall.CLONE_FILES |
-					syscall.CLONE_SIGHAND |
-					syscall.CLONE_SYSVSEM |
-					syscall.CLONE_THREAD),
-		},
-	},
-	syscall.SYS_CLOSE: {},
-	syscall.SYS_DUP:   {},
+	syscall.SYS_CLOSE:         {},
+	syscall.SYS_DUP:           {},
 	syscall.SYS_DUP3: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.O_CLOEXEC),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.O_CLOEXEC),
 		},
 	},
 	syscall.SYS_EPOLL_CREATE1: {},
 	syscall.SYS_EPOLL_CTL:     {},
 	syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_EVENTFD2: []seccomp.Rule{
 		{
-			seccomp.AllowValue(0),
-			seccomp.AllowValue(0),
+			seccomp.EqualTo(0),
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_EXIT:       {},
@@ -70,16 +59,16 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_FCHMOD:     {},
 	syscall.SYS_FCNTL: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.F_GETFL),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.F_GETFL),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.F_SETFL),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.F_SETFL),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.F_GETFD),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.F_GETFD),
 		},
 	},
 	syscall.SYS_FSTAT:     {},
@@ -87,52 +76,52 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_FTRUNCATE: {},
 	syscall.SYS_FUTEX: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
+			seccomp.MatchAny{},
 		},
 		// Non-private variants are included for flipcall support. They are otherwise
 		// unncessary, as the sentry will use only private futexes internally.
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAIT),
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAIT),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAKE),
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAKE),
+			seccomp.MatchAny{},
 		},
 	},
 	syscall.SYS_GETPID: {},
 	unix.SYS_GETRANDOM: {},
 	syscall.SYS_GETSOCKOPT: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.SOL_SOCKET),
-			seccomp.AllowValue(syscall.SO_DOMAIN),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.SOL_SOCKET),
+			seccomp.EqualTo(syscall.SO_DOMAIN),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.SOL_SOCKET),
-			seccomp.AllowValue(syscall.SO_TYPE),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.SOL_SOCKET),
+			seccomp.EqualTo(syscall.SO_TYPE),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.SOL_SOCKET),
-			seccomp.AllowValue(syscall.SO_ERROR),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.SOL_SOCKET),
+			seccomp.EqualTo(syscall.SO_ERROR),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.SOL_SOCKET),
-			seccomp.AllowValue(syscall.SO_SNDBUF),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.SOL_SOCKET),
+			seccomp.EqualTo(syscall.SO_SNDBUF),
 		},
 	},
 	syscall.SYS_GETTID:       {},
@@ -141,38 +130,44 @@ var allowedSyscalls = seccomp.SyscallRules{
 	// setting/getting termios and winsize.
 	syscall.SYS_IOCTL: []seccomp.Rule{
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TCGETS),
-			seccomp.AllowAny{}, /* termios struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TCGETS),
+			seccomp.MatchAny{}, /* termios struct */
 		},
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TCSETS),
-			seccomp.AllowAny{}, /* termios struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TCSETS),
+			seccomp.MatchAny{}, /* termios struct */
 		},
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TCSETSF),
-			seccomp.AllowAny{}, /* termios struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TCSETSF),
+			seccomp.MatchAny{}, /* termios struct */
 		},
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TCSETSW),
-			seccomp.AllowAny{}, /* termios struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TCSETSW),
+			seccomp.MatchAny{}, /* termios struct */
 		},
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TIOCSWINSZ),
-			seccomp.AllowAny{}, /* winsize struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TIOCSWINSZ),
+			seccomp.MatchAny{}, /* winsize struct */
 		},
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TIOCGWINSZ),
-			seccomp.AllowAny{}, /* winsize struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TIOCGWINSZ),
+			seccomp.MatchAny{}, /* winsize struct */
 		},
 	},
 	syscall.SYS_LSEEK:   {},
 	syscall.SYS_MADVISE: {},
+	unix.SYS_MEMBARRIER: []seccomp.Rule{
+		{
+			seccomp.EqualTo(linux.MEMBARRIER_CMD_GLOBAL),
+			seccomp.EqualTo(0),
+		},
+	},
 	syscall.SYS_MINCORE: {},
 	// Used by the Go runtime as a temporarily workaround for a Linux
 	// 5.2-5.4 bug.
@@ -182,46 +177,46 @@ var allowedSyscalls = seccomp.SyscallRules{
 	// TODO(b/148688965): Remove once this is gone from Go.
 	syscall.SYS_MLOCK: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(4096),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(4096),
 		},
 	},
 	syscall.SYS_MMAP: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_SHARED),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_SHARED),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_PRIVATE),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_PRIVATE),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.PROT_WRITE | syscall.PROT_READ),
-			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.PROT_WRITE | syscall.PROT_READ),
+			seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
 		},
 	},
 	syscall.SYS_MPROTECT:  {},
@@ -237,32 +232,32 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_READ:      {},
 	syscall.SYS_RECVMSG: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
 		},
 	},
 	syscall.SYS_RECVMMSG: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(fdbased.MaxMsgsPerRecv),
-			seccomp.AllowValue(syscall.MSG_DONTWAIT),
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(fdbased.MaxMsgsPerRecv),
+			seccomp.EqualTo(syscall.MSG_DONTWAIT),
+			seccomp.EqualTo(0),
 		},
 	},
 	unix.SYS_SENDMMSG: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT),
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT),
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_RESTART_SYSCALL: {},
@@ -272,57 +267,50 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_SCHED_YIELD:     {},
 	syscall.SYS_SENDMSG: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
 		},
 	},
 	syscall.SYS_SETITIMER: {},
 	syscall.SYS_SHUTDOWN: []seccomp.Rule{
 		// Used by fs/host to shutdown host sockets.
-		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)},
-		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)},
+		{seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RD)},
+		{seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_WR)},
 		// Used by unet to shutdown connections.
-		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
+		{seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RDWR)},
 	},
 	syscall.SYS_SIGALTSTACK:     {},
 	unix.SYS_STATX:              {},
 	syscall.SYS_SYNC_FILE_RANGE: {},
 	syscall.SYS_TEE: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(1),                      /* len */
-			seccomp.AllowValue(unix.SPLICE_F_NONBLOCK), /* flags */
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(1),                      /* len */
+			seccomp.EqualTo(unix.SPLICE_F_NONBLOCK), /* flags */
 		},
 	},
 	syscall.SYS_TGKILL: []seccomp.Rule{
 		{
-			seccomp.AllowValue(uint64(os.Getpid())),
+			seccomp.EqualTo(uint64(os.Getpid())),
 		},
 	},
 	syscall.SYS_UTIMENSAT: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0), /* null pathname */
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0), /* flags */
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0), /* null pathname */
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0), /* flags */
 		},
 	},
 	syscall.SYS_WRITE: {},
-	// The only user in rawfile.NonBlockingWrite3 always passes iovcnt with
-	// values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR
-	// option is enabled for a packet socket.
+	// For rawfile.NonBlockingWriteIovec.
 	syscall.SYS_WRITEV: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(2),
-		},
-		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(3),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.GreaterThan(0),
 		},
 	},
 }
@@ -332,10 +320,10 @@ func hostInetFilters() seccomp.SyscallRules {
 	return seccomp.SyscallRules{
 		syscall.SYS_ACCEPT4: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowAny{},
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.MatchAny{},
+				seccomp.MatchAny{},
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
 			},
 		},
 		syscall.SYS_BIND:        {},
@@ -344,84 +332,84 @@ func hostInetFilters() seccomp.SyscallRules {
 		syscall.SYS_GETSOCKNAME: {},
 		syscall.SYS_GETSOCKOPT: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IP),
-				seccomp.AllowValue(syscall.IP_TOS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IP),
+				seccomp.EqualTo(syscall.IP_TOS),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IP),
-				seccomp.AllowValue(syscall.IP_RECVTOS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IP),
+				seccomp.EqualTo(syscall.IP_RECVTOS),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_TCLASS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_TCLASS),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_RECVTCLASS),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_V6ONLY),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_V6ONLY),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_ERROR),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_ERROR),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_KEEPALIVE),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_KEEPALIVE),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_SNDBUF),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_SNDBUF),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_RCVBUF),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_RCVBUF),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_REUSEADDR),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_REUSEADDR),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_TYPE),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_TYPE),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_LINGER),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_LINGER),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_TCP),
-				seccomp.AllowValue(syscall.TCP_NODELAY),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_TCP),
+				seccomp.EqualTo(syscall.TCP_NODELAY),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_TCP),
-				seccomp.AllowValue(syscall.TCP_INFO),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_TCP),
+				seccomp.EqualTo(syscall.TCP_INFO),
 			},
 		},
 		syscall.SYS_IOCTL: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.TIOCOUTQ),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.TIOCOUTQ),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.TIOCINQ),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.TIOCINQ),
 			},
 		},
 		syscall.SYS_LISTEN:   {},
@@ -432,103 +420,103 @@ func hostInetFilters() seccomp.SyscallRules {
 		syscall.SYS_SENDTO:   {},
 		syscall.SYS_SETSOCKOPT: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_V6ONLY),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_V6ONLY),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_SNDBUF),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_SNDBUF),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_RCVBUF),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_RCVBUF),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_REUSEADDR),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_REUSEADDR),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_TCP),
-				seccomp.AllowValue(syscall.TCP_NODELAY),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_TCP),
+				seccomp.EqualTo(syscall.TCP_NODELAY),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IP),
-				seccomp.AllowValue(syscall.IP_TOS),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IP),
+				seccomp.EqualTo(syscall.IP_TOS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IP),
-				seccomp.AllowValue(syscall.IP_RECVTOS),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IP),
+				seccomp.EqualTo(syscall.IP_RECVTOS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_TCLASS),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_TCLASS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_RECVTCLASS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 		},
 		syscall.SYS_SHUTDOWN: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SHUT_RD),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SHUT_RD),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SHUT_WR),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SHUT_WR),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SHUT_RDWR),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SHUT_RDWR),
 			},
 		},
 		syscall.SYS_SOCKET: []seccomp.Rule{
 			{
-				seccomp.AllowValue(syscall.AF_INET),
-				seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
-				seccomp.AllowValue(0),
+				seccomp.EqualTo(syscall.AF_INET),
+				seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.EqualTo(0),
 			},
 			{
-				seccomp.AllowValue(syscall.AF_INET),
-				seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
-				seccomp.AllowValue(0),
+				seccomp.EqualTo(syscall.AF_INET),
+				seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.EqualTo(0),
 			},
 			{
-				seccomp.AllowValue(syscall.AF_INET6),
-				seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
-				seccomp.AllowValue(0),
+				seccomp.EqualTo(syscall.AF_INET6),
+				seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.EqualTo(0),
 			},
 			{
-				seccomp.AllowValue(syscall.AF_INET6),
-				seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
-				seccomp.AllowValue(0),
+				seccomp.EqualTo(syscall.AF_INET6),
+				seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.EqualTo(0),
 			},
 		},
 		syscall.SYS_WRITEV: {},
@@ -539,20 +527,20 @@ func controlServerFilters(fd int) seccomp.SyscallRules {
 	return seccomp.SyscallRules{
 		syscall.SYS_ACCEPT: []seccomp.Rule{
 			{
-				seccomp.AllowValue(fd),
+				seccomp.EqualTo(fd),
 			},
 		},
 		syscall.SYS_LISTEN: []seccomp.Rule{
 			{
-				seccomp.AllowValue(fd),
-				seccomp.AllowValue(16 /* unet.backlog */),
+				seccomp.EqualTo(fd),
+				seccomp.EqualTo(16 /* unet.backlog */),
 			},
 		},
 		syscall.SYS_GETSOCKOPT: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_PEERCRED),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_PEERCRED),
 			},
 		},
 	}
diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go
index 5335ff82c..cea5613b8 100644
--- a/runsc/boot/filter/config_amd64.go
+++ b/runsc/boot/filter/config_amd64.go
@@ -24,8 +24,41 @@ import (
 )
 
 func init() {
-	allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL],
-		seccomp.Rule{seccomp.AllowValue(linux.ARCH_GET_FS)},
-		seccomp.Rule{seccomp.AllowValue(linux.ARCH_SET_FS)},
-	)
+	allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{
+		// TODO(b/168828518): No longer used in Go 1.16+.
+		{seccomp.EqualTo(linux.ARCH_SET_FS)},
+	}
+
+	allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{
+		// parent_tidptr and child_tidptr are always 0 because neither
+		// CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used.
+		{
+			seccomp.EqualTo(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SETTLS |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+			seccomp.MatchAny{}, // newsp
+			seccomp.EqualTo(0), // parent_tidptr
+			seccomp.EqualTo(0), // child_tidptr
+			seccomp.MatchAny{}, // tls
+		},
+		{
+			// TODO(b/168828518): No longer used in Go 1.16+ (on amd64).
+			seccomp.EqualTo(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+			seccomp.MatchAny{}, // newsp
+			seccomp.EqualTo(0), // parent_tidptr
+			seccomp.EqualTo(0), // child_tidptr
+			seccomp.MatchAny{}, // tls
+		},
+	}
 }
diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go
index 7fa9bbda3..37313f97f 100644
--- a/runsc/boot/filter/config_arm64.go
+++ b/runsc/boot/filter/config_arm64.go
@@ -16,6 +16,29 @@
 
 package filter
 
-// Reserve for future customization.
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
 func init() {
+	allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{
+		{
+			seccomp.EqualTo(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+			seccomp.MatchAny{}, // newsp
+			// These arguments are left uninitialized by the Go
+			// runtime, so they may be anything (and are unused by
+			// the host).
+			seccomp.MatchAny{}, // parent_tidptr
+			seccomp.MatchAny{}, // tls
+			seccomp.MatchAny{}, // child_tidptr
+		},
+	}
 }
diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go
index 194952a7b..7b8669595 100644
--- a/runsc/boot/filter/config_profile.go
+++ b/runsc/boot/filter/config_profile.go
@@ -25,9 +25,9 @@ func profileFilters() seccomp.SyscallRules {
 	return seccomp.SyscallRules{
 		syscall.SYS_OPENAT: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
+				seccomp.MatchAny{},
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
 			},
 		},
 	}
diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go
index 5e5a3c998..209e646a7 100644
--- a/runsc/boot/filter/extra_filters_msan.go
+++ b/runsc/boot/filter/extra_filters_msan.go
@@ -26,6 +26,8 @@ import (
 func instrumentationFilters() seccomp.SyscallRules {
 	Report("MSAN is enabled: syscall filters less restrictive!")
 	return seccomp.SyscallRules{
+		syscall.SYS_CLONE:             {},
+		syscall.SYS_MMAP:              {},
 		syscall.SYS_SCHED_GETAFFINITY: {},
 		syscall.SYS_SET_ROBUST_LIST:   {},
 	}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index b98a1eb50..6b6ae98d7 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -29,10 +29,12 @@ import (
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
@@ -47,6 +49,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -65,7 +68,7 @@ const (
 // tmpfs has some extra supported options that we must pass through.
 var tmpfsAllowedData = []string{"mode", "uid", "gid"}
 
-func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
+func addOverlay(ctx context.Context, conf *config.Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
 	// Upper layer uses the same flags as lower, but it must be read-write.
 	upperFlags := lowerFlags
 	upperFlags.ReadOnly = false
@@ -102,33 +105,28 @@ func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string,
 // mandatory mounts that are required by the OCI specification.
 func compileMounts(spec *specs.Spec) []specs.Mount {
 	// Keep track of whether proc and sys were mounted.
-	var procMounted, sysMounted bool
+	var procMounted, sysMounted, devMounted, devptsMounted bool
 	var mounts []specs.Mount
 
-	// Always mount /dev.
-	mounts = append(mounts, specs.Mount{
-		Type:        devtmpfs.Name,
-		Destination: "/dev",
-	})
-
-	mounts = append(mounts, specs.Mount{
-		Type:        devpts.Name,
-		Destination: "/dev/pts",
-	})
-
 	// Mount all submounts from the spec.
 	for _, m := range spec.Mounts {
 		if !specutils.IsSupportedDevMount(m) {
 			log.Warningf("ignoring dev mount at %q", m.Destination)
 			continue
 		}
-		mounts = append(mounts, m)
 		switch filepath.Clean(m.Destination) {
 		case "/proc":
 			procMounted = true
 		case "/sys":
 			sysMounted = true
+		case "/dev":
+			m.Type = devtmpfs.Name
+			devMounted = true
+		case "/dev/pts":
+			m.Type = devpts.Name
+			devptsMounted = true
 		}
+		mounts = append(mounts, m)
 	}
 
 	// Mount proc and sys even if the user did not ask for it, as the spec
@@ -146,6 +144,18 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
 			Destination: "/sys",
 		})
 	}
+	if !devMounted {
+		mandatoryMounts = append(mandatoryMounts, specs.Mount{
+			Type:        devtmpfs.Name,
+			Destination: "/dev",
+		})
+	}
+	if !devptsMounted {
+		mandatoryMounts = append(mandatoryMounts, specs.Mount{
+			Type:        devpts.Name,
+			Destination: "/dev/pts",
+		})
+	}
 
 	// The mandatory mounts should be ordered right after the root, in case
 	// there are submounts of these mandatory mounts already in the spec.
@@ -155,7 +165,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
 }
 
 // p9MountData creates a slice of p9 mount data.
-func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
+func p9MountData(fd int, fa config.FileAccessType, vfs2 bool) []string {
 	opts := []string{
 		"trans=fd",
 		"rfdno=" + strconv.Itoa(fd),
@@ -166,7 +176,7 @@ func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
 		// enablement.
 		opts = append(opts, "privateunixsocket=true")
 	}
-	if fa == FileAccessShared {
+	if fa == config.FileAccessShared {
 		opts = append(opts, "cache=remote_revalidating")
 	}
 	return opts
@@ -251,7 +261,7 @@ func mustFindFilesystem(name string) fs.Filesystem {
 
 // addSubmountOverlay overlays the inode over a ramfs tree containing the given
 // paths.
-func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
+func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string, mf fs.MountSourceFlags) (*fs.Inode, error) {
 	// Construct a ramfs tree of mount points. The contents never
 	// change, so this can be fully caching. There's no real
 	// filesystem backing this tree, so we set the filesystem to
@@ -261,7 +271,7 @@ func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string
 	if err != nil {
 		return nil, fmt.Errorf("creating mount tree: %v", err)
 	}
-	overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{})
+	overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, mf)
 	if err != nil {
 		return nil, fmt.Errorf("adding mount overlay: %v", err)
 	}
@@ -280,7 +290,7 @@ func subtargets(root string, mnts []specs.Mount) []string {
 	return targets
 }
 
-func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+func setupContainerFS(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
 	if conf.VFS2 {
 		return setupContainerVFS2(ctx, conf, mntr, procArgs)
 	}
@@ -293,11 +303,11 @@ func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter,
 	procArgs.MountNamespace = mns
 
 	// Resolve the executable path from working dir and environment.
-	f, err := user.ResolveExecutablePath(ctx, procArgs.Credentials, procArgs.MountNamespace, procArgs.Envv, procArgs.WorkingDirectory, procArgs.Argv[0])
+	resolved, err := user.ResolveExecutablePath(ctx, procArgs)
 	if err != nil {
-		return fmt.Errorf("searching for executable %q, cwd: %q, envv: %q: %v", procArgs.Argv[0], procArgs.WorkingDirectory, procArgs.Envv, err)
+		return err
 	}
-	procArgs.Filename = f
+	procArgs.Filename = resolved
 	return nil
 }
 
@@ -318,14 +328,14 @@ func adjustDirentCache(k *kernel.Kernel) error {
 }
 
 type fdDispenser struct {
-	fds []int
+	fds []*fd.FD
 }
 
 func (f *fdDispenser) remove() int {
 	if f.empty() {
 		panic("fdDispenser out of fds")
 	}
-	rv := f.fds[0]
+	rv := f.fds[0].Release()
 	f.fds = f.fds[1:]
 	return rv
 }
@@ -390,6 +400,10 @@ type mountHint struct {
 	// root is the inode where the volume is mounted. For mounts with 'pod' share
 	// the volume is mounted once and then bind mounted inside the containers.
 	root *fs.Inode
+
+	// vfsMount is the master mount for the volume. For mounts with 'pod' share
+	// the master volume is bind mounted inside the containers.
+	vfsMount *vfs.Mount
 }
 
 func (m *mountHint) setField(key, val string) error {
@@ -447,27 +461,27 @@ func (m *mountHint) isSupported() bool {
 func (m *mountHint) checkCompatible(mount specs.Mount) error {
 	// Remove options that don't affect to mount's behavior.
 	masterOpts := filterUnsupportedOptions(m.mount)
-	slaveOpts := filterUnsupportedOptions(mount)
+	replicaOpts := filterUnsupportedOptions(mount)
 
-	if len(masterOpts) != len(slaveOpts) {
-		return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+	if len(masterOpts) != len(replicaOpts) {
+		return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts)
 	}
 
 	sort.Strings(masterOpts)
-	sort.Strings(slaveOpts)
+	sort.Strings(replicaOpts)
 	for i, opt := range masterOpts {
-		if opt != slaveOpts[i] {
-			return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+		if opt != replicaOpts[i] {
+			return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts)
 		}
 	}
 	return nil
 }
 
-func (m *mountHint) fileAccessType() FileAccessType {
+func (m *mountHint) fileAccessType() config.FileAccessType {
 	if m.share == container {
-		return FileAccessExclusive
+		return config.FileAccessExclusive
 	}
-	return FileAccessShared
+	return config.FileAccessShared
 }
 
 func filterUnsupportedOptions(mount specs.Mount) []string {
@@ -558,7 +572,7 @@ type containerMounter struct {
 	hints *podMountHints
 }
 
-func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
+func newContainerMounter(spec *specs.Spec, goferFDs []*fd.FD, k *kernel.Kernel, hints *podMountHints) *containerMounter {
 	return &containerMounter{
 		root:   spec.Root,
 		mounts: compileMounts(spec),
@@ -571,9 +585,9 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
 // processHints processes annotations that container hints about how volumes
 // should be mounted (e.g. a volume shared between containers). It must be
 // called for the root container only.
-func (c *containerMounter) processHints(conf *Config) error {
+func (c *containerMounter) processHints(conf *config.Config, creds *auth.Credentials) error {
 	if conf.VFS2 {
-		return nil
+		return c.processHintsVFS2(conf, creds)
 	}
 	ctx := c.k.SupervisorContext()
 	for _, hint := range c.hints.mounts {
@@ -595,7 +609,7 @@ func (c *containerMounter) processHints(conf *Config) error {
 // setupFS is used to set up the file system for all containers. This is the
 // main entry point method, with most of the other being internal only. It
 // returns the mount namespace that is created for the container.
-func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
+func (c *containerMounter) setupFS(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
 	log.Infof("Configuring container's file system")
 
 	// Create context with root credentials to mount the filesystem (the current
@@ -621,7 +635,7 @@ func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessA
 	return mns, nil
 }
 
-func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Config) (*fs.MountNamespace, error) {
+func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config) (*fs.MountNamespace, error) {
 	rootInode, err := c.createRootMount(ctx, conf)
 	if err != nil {
 		return nil, fmt.Errorf("creating filesystem for container: %v", err)
@@ -633,9 +647,9 @@ func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Confi
 	return mns, nil
 }
 
-func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error {
+func (c *containerMounter) mountSubmounts(ctx context.Context, conf *config.Config, mns *fs.MountNamespace) error {
 	root := mns.Root()
-	defer root.DecRef()
+	defer root.DecRef(ctx)
 
 	for _, m := range c.mounts {
 		log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options)
@@ -669,7 +683,7 @@ func (c *containerMounter) checkDispenser() error {
 
 // mountSharedMaster mounts the master of a volume that is shared among
 // containers in a pod. It returns the root mount's inode.
-func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) {
+func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *config.Config, hint *mountHint) (*fs.Inode, error) {
 	// Map mount type to filesystem name, and parse out the options that we are
 	// capable of dealing with.
 	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount)
@@ -709,7 +723,7 @@ func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config,
 }
 
 // createRootMount creates the root filesystem.
-func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) {
+func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Config) (*fs.Inode, error) {
 	// First construct the filesystem from the spec.Root.
 	mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay}
 
@@ -734,7 +748,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
 	// for submount paths.  "/dev" "/sys" "/proc" and "/tmp" are always
 	// mounted even if they are not in the spec.
 	submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp")
-	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
+	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts, mf)
 	if err != nil {
 		return nil, fmt.Errorf("adding submount overlay: %v", err)
 	}
@@ -754,7 +768,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
 
 // getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
 // used for mounts.
-func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) {
+func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m specs.Mount) (string, []string, bool, error) {
 	var (
 		fsName     string
 		opts       []string
@@ -788,19 +802,19 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 	return fsName, opts, useOverlay, nil
 }
 
-func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType {
+func (c *containerMounter) getMountAccessType(mount specs.Mount) config.FileAccessType {
 	if hint := c.hints.findMount(mount); hint != nil {
 		return hint.fileAccessType()
 	}
 	// Non-root bind mounts are always shared if no hints were provided.
-	return FileAccessShared
+	return config.FileAccessShared
 }
 
 // mountSubmount mounts volumes inside the container's root. Because mounts may
 // be readonly, a lower ramfs overlay is added to create the mount point dir.
 // Another overlay is added with tmpfs on top if Config.Overlay is true.
 // 'm.Destination' must be an absolute path with '..' and symlinks resolved.
-func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
+func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
 	// Map mount type to filesystem name, and parse out the options that we are
 	// capable of dealing with.
 	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
@@ -844,7 +858,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
 	submounts := subtargets(m.Destination, c.mounts)
 	if len(submounts) > 0 {
 		log.Infof("Adding submount overlay over %q", m.Destination)
-		inode, err = addSubmountOverlay(ctx, inode, submounts)
+		inode, err = addSubmountOverlay(ctx, inode, submounts, mf)
 		if err != nil {
 			return fmt.Errorf("adding submount overlay: %v", err)
 		}
@@ -863,7 +877,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
 	if err != nil {
 		return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
 	}
-	defer dirent.DecRef()
+	defer dirent.DecRef(ctx)
 	if err := mns.Mount(ctx, dirent, inode); err != nil {
 		return fmt.Errorf("mount %q error: %v", m.Destination, err)
 	}
@@ -884,12 +898,12 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun
 	if err != nil {
 		return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err)
 	}
-	defer target.DecRef()
+	defer target.DecRef(ctx)
 
 	// Take a ref on the inode that is about to be (re)-mounted.
 	source.root.IncRef()
 	if err := mns.Mount(ctx, target, source.root); err != nil {
-		source.root.DecRef()
+		source.root.DecRef(ctx)
 		return fmt.Errorf("bind mount %q error: %v", mount.Destination, err)
 	}
 
@@ -899,7 +913,7 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun
 
 // addRestoreMount adds a mount to the MountSources map used for restoring a
 // checkpointed container.
-func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
+func (c *containerMounter) addRestoreMount(conf *config.Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
 	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
 	if err != nil {
 		return err
@@ -924,7 +938,7 @@ func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnviron
 
 // createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding
 // the mounts to the environment.
-func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) {
+func (c *containerMounter) createRestoreEnvironment(conf *config.Config) (*fs.RestoreEnvironment, error) {
 	renv := &fs.RestoreEnvironment{
 		MountSources: make(map[string][]fs.MountArgs),
 	}
@@ -979,7 +993,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn
 //
 // Note that when there are submounts inside of '/tmp', directories for the
 // mount points must be present, making '/tmp' not empty anymore.
-func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
+func (c *containerMounter) mountTmp(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent) error {
 	for _, m := range c.mounts {
 		if filepath.Clean(m.Destination) == "/tmp" {
 			log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m)
@@ -992,12 +1006,12 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.M
 	switch err {
 	case nil:
 		// Found '/tmp' in filesystem, check if it's empty.
-		defer tmp.DecRef()
+		defer tmp.DecRef(ctx)
 		f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true})
 		if err != nil {
 			return err
 		}
-		defer f.DecRef()
+		defer f.DecRef(ctx)
 		serializer := &fs.CollectEntriesSerializer{}
 		if err := f.Readdir(ctx, serializer); err != nil {
 			return err
diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go
index 912037075..e986231e5 100644
--- a/runsc/boot/fs_test.go
+++ b/runsc/boot/fs_test.go
@@ -20,6 +20,7 @@ import (
 	"testing"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 func TestPodMountHintsHappy(t *testing.T) {
@@ -196,7 +197,7 @@ func TestGetMountAccessType(t *testing.T) {
 	for _, tst := range []struct {
 		name        string
 		annotations map[string]string
-		want        FileAccessType
+		want        config.FileAccessType
 	}{
 		{
 			name: "container=exclusive",
@@ -205,7 +206,7 @@ func TestGetMountAccessType(t *testing.T) {
 				MountPrefix + "mount1.type":   "bind",
 				MountPrefix + "mount1.share":  "container",
 			},
-			want: FileAccessExclusive,
+			want: config.FileAccessExclusive,
 		},
 		{
 			name: "pod=shared",
@@ -214,7 +215,7 @@ func TestGetMountAccessType(t *testing.T) {
 				MountPrefix + "mount1.type":   "bind",
 				MountPrefix + "mount1.share":  "pod",
 			},
-			want: FileAccessShared,
+			want: config.FileAccessShared,
 		},
 		{
 			name: "shared=shared",
@@ -223,7 +224,7 @@ func TestGetMountAccessType(t *testing.T) {
 				MountPrefix + "mount1.type":   "bind",
 				MountPrefix + "mount1.share":  "shared",
 			},
-			want: FileAccessShared,
+			want: config.FileAccessShared,
 		},
 		{
 			name: "default=shared",
@@ -232,7 +233,7 @@ func TestGetMountAccessType(t *testing.T) {
 				MountPrefix + "mount1.type":   "bind",
 				MountPrefix + "mount1.share":  "container",
 			},
-			want: FileAccessShared,
+			want: config.FileAccessShared,
 		},
 	} {
 		t.Run(tst.name, func(t *testing.T) {
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index f802bc9fb..8c6ab213d 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -16,22 +16,26 @@
 package boot
 
 import (
+	"errors"
 	"fmt"
 	mrand "math/rand"
 	"os"
 	"runtime"
 	"sync/atomic"
-	"syscall"
 	gtime "time"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/bpf"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/memutil"
 	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/refsvfs2"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/fdimport"
@@ -66,7 +70,9 @@ import (
 	"gvisor.dev/gvisor/runsc/boot/filter"
 	_ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms.
 	"gvisor.dev/gvisor/runsc/boot/pprof"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
+	"gvisor.dev/gvisor/runsc/specutils/seccomp"
 
 	// Include supported socket providers.
 	"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
@@ -77,6 +83,22 @@ import (
 	_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
 )
 
+type containerInfo struct {
+	conf *config.Config
+
+	// spec is the base configuration for the root container.
+	spec *specs.Spec
+
+	// procArgs refers to the container's init task.
+	procArgs kernel.CreateProcessArgs
+
+	// stdioFDs contains stdin, stdout, and stderr.
+	stdioFDs []*fd.FD
+
+	// goferFDs are the FDs that attach the sandbox to the gofers.
+	goferFDs []*fd.FD
+}
+
 // Loader keeps state needed to start the kernel and run the container..
 type Loader struct {
 	// k is the kernel.
@@ -85,22 +107,11 @@ type Loader struct {
 	// ctrl is the control server.
 	ctrl *controller
 
-	conf *Config
-
-	// console is set to true if terminal is enabled.
-	console bool
+	// root contains information about the root container in the sandbox.
+	root containerInfo
 
 	watchdog *watchdog.Watchdog
 
-	// stdioFDs contains stdin, stdout, and stderr.
-	stdioFDs []int
-
-	// goferFDs are the FDs that attach the sandbox to the gofers.
-	goferFDs []int
-
-	// spec is the base configuration for the root container.
-	spec *specs.Spec
-
 	// stopSignalForwarding disables forwarding of signals to the sandboxed
 	// container. It should be called when a sandbox is destroyed.
 	stopSignalForwarding func()
@@ -108,9 +119,6 @@ type Loader struct {
 	// restore is set to true if we are restoring a container.
 	restore bool
 
-	// rootProcArgs refers to the root sandbox init task.
-	rootProcArgs kernel.CreateProcessArgs
-
 	// sandboxID is the ID for the whole sandbox.
 	sandboxID string
 
@@ -162,7 +170,7 @@ type Args struct {
 	// Spec is the sandbox specification.
 	Spec *specs.Spec
 	// Conf is the system configuration.
-	Conf *Config
+	Conf *config.Config
 	// ControllerFD is the FD to the URPC controller. The Loader takes ownership
 	// of this FD and may close it at any time.
 	ControllerFD int
@@ -175,8 +183,6 @@ type Args struct {
 	// StdioFDs is the stdio for the application. The Loader takes ownership of
 	// these FDs and may close them at any time.
 	StdioFDs []int
-	// Console is set to true if using TTY.
-	Console bool
 	// NumCPU is the number of CPUs to create inside the sandbox.
 	NumCPU int
 	// TotalMem is the initial amount of total memory to report back to the
@@ -187,7 +193,7 @@ type Args struct {
 }
 
 // make sure stdioFDs are always the same on initial start and on restore
-const startingStdioFD = 64
+const startingStdioFD = 256
 
 // New initializes a new kernel loader configured by spec.
 // New also handles setting up a kernel for restoring a container.
@@ -205,6 +211,10 @@ func New(args Args) (*Loader, error) {
 	// Is this a VFSv2 kernel?
 	if args.Conf.VFS2 {
 		kernel.VFS2Enabled = true
+		if args.Conf.FUSE {
+			kernel.FUSEEnabled = true
+		}
+
 		vfs2.Override()
 	}
 
@@ -227,9 +237,7 @@ func New(args Args) (*Loader, error) {
 	// Create VDSO.
 	//
 	// Pass k as the platform since it is savable, unlike the actual platform.
-	//
-	// FIXME(b/109889800): Use non-nil context.
-	vdso, err := loader.PrepareVDSO(nil, k)
+	vdso, err := loader.PrepareVDSO(k)
 	if err != nil {
 		return nil, fmt.Errorf("creating vdso: %v", err)
 	}
@@ -275,6 +283,7 @@ func New(args Args) (*Loader, error) {
 		args.NumCPU = runtime.NumCPU()
 	}
 	log.Infof("CPUs: %d", args.NumCPU)
+	runtime.GOMAXPROCS(args.NumCPU)
 
 	if args.TotalMem > 0 {
 		// Adjust the total memory returned by the Sentry so that applications that
@@ -300,6 +309,12 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("initializing kernel: %v", err)
 	}
 
+	if kernel.VFS2Enabled {
+		if err := registerFilesystems(k); err != nil {
+			return nil, fmt.Errorf("registering filesystems: %w", err)
+		}
+	}
+
 	if err := adjustDirentCache(k); err != nil {
 		return nil, err
 	}
@@ -318,7 +333,7 @@ func New(args Args) (*Loader, error) {
 	dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
 	dog := watchdog.New(k, dogOpts)
 
-	procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
+	procArgs, err := createProcessArgs(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
 	if err != nil {
 		return nil, fmt.Errorf("creating init process for root container: %v", err)
 	}
@@ -338,7 +353,7 @@ func New(args Args) (*Loader, error) {
 		if err != nil {
 			return nil, fmt.Errorf("failed to create hostfs filesystem: %v", err)
 		}
-		defer hostFilesystem.DecRef()
+		defer hostFilesystem.DecRef(k.SupervisorContext())
 		hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{})
 		if err != nil {
 			return nil, fmt.Errorf("failed to create hostfs mount: %v", err)
@@ -346,37 +361,45 @@ func New(args Args) (*Loader, error) {
 		k.SetHostMount(hostMount)
 	}
 
+	info := containerInfo{
+		conf:     args.Conf,
+		spec:     args.Spec,
+		procArgs: procArgs,
+	}
+
 	// Make host FDs stable between invocations. Host FDs must map to the exact
 	// same number when the sandbox is restored. Otherwise the wrong FD will be
 	// used.
-	var stdioFDs []int
 	newfd := startingStdioFD
-	for _, fd := range args.StdioFDs {
-		err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC)
-		if err != nil {
-			return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err)
+	for _, stdioFD := range args.StdioFDs {
+		// Check that newfd is unused to avoid clobbering over it.
+		if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
+			if err != nil {
+				return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
+			}
+			return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
 		}
-		stdioFDs = append(stdioFDs, newfd)
-		err = syscall.Close(fd)
+
+		err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
 		if err != nil {
-			return nil, fmt.Errorf("close original stdioFDs failed: %v", err)
+			return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
 		}
+		info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
+		_ = unix.Close(stdioFD)
 		newfd++
 	}
+	for _, goferFD := range args.GoferFDs {
+		info.goferFDs = append(info.goferFDs, fd.New(goferFD))
+	}
 
 	eid := execID{cid: args.ID}
 	l := &Loader{
-		k:            k,
-		conf:         args.Conf,
-		console:      args.Console,
-		watchdog:     dog,
-		spec:         args.Spec,
-		goferFDs:     args.GoferFDs,
-		stdioFDs:     stdioFDs,
-		rootProcArgs: procArgs,
-		sandboxID:    args.ID,
-		processes:    map[execID]*execProcess{eid: {}},
-		mountHints:   mountHints,
+		k:          k,
+		watchdog:   dog,
+		sandboxID:  args.ID,
+		processes:  map[execID]*execProcess{eid: {}},
+		mountHints: mountHints,
+		root:       info,
 	}
 
 	// We don't care about child signals; some platforms can generate a
@@ -404,8 +427,8 @@ func New(args Args) (*Loader, error) {
 	return l, nil
 }
 
-// newProcess creates a process that can be run with kernel.CreateProcess.
-func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
+// createProcessArgs creates args that can be used with kernel.CreateProcess.
+func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
 	// Create initial limits.
 	ls, err := createLimitSet(spec)
 	if err != nil {
@@ -449,9 +472,29 @@ func (l *Loader) Destroy() {
 		l.stopSignalForwarding()
 	}
 	l.watchdog.Stop()
+
+	// Release all kernel resources. This is only safe after we can no longer
+	// save/restore.
+	l.k.Release()
+
+	// All sentry-created resources should have been released at this point;
+	// check for reference leaks.
+	if refsvfs2.LeakCheckEnabled() {
+		refsvfs2.DoLeakCheck()
+	}
+
+	// In the success case, stdioFDs and goferFDs only contain released/closed
+	// FDs whose ownership has been passed over to host FDs and gofer sessions.
+	// Close them here in case of failure.
+	for _, fd := range l.root.stdioFDs {
+		_ = fd.Close()
+	}
+	for _, fd := range l.root.goferFDs {
+		_ = fd.Close()
+	}
 }
 
-func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) {
+func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) {
 	p, err := platform.Lookup(conf.Platform)
 	if err != nil {
 		panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err))
@@ -478,14 +521,15 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {
 	return mf, nil
 }
 
+// installSeccompFilters installs sandbox seccomp filters with the host.
 func (l *Loader) installSeccompFilters() error {
-	if l.conf.DisableSeccomp {
+	if l.root.conf.DisableSeccomp {
 		filter.Report("syscall filter is DISABLED. Running in less secure mode.")
 	} else {
 		opts := filter.Options{
 			Platform:      l.k.Platform,
-			HostNetwork:   l.conf.Network == NetworkHost,
-			ProfileEnable: l.conf.ProfileEnable,
+			HostNetwork:   l.root.conf.Network == config.NetworkHost,
+			ProfileEnable: l.root.conf.ProfileEnable,
 			ControllerFD:  l.ctrl.srv.FD(),
 		}
 		if err := filter.Install(opts); err != nil {
@@ -511,7 +555,7 @@ func (l *Loader) Run() error {
 }
 
 func (l *Loader) run() error {
-	if l.conf.Network == NetworkHost {
+	if l.root.conf.Network == config.NetworkHost {
 		// Delay host network configuration to this point because network namespace
 		// is configured after the loader is created and before Run() is called.
 		log.Debugf("Configuring host network")
@@ -532,10 +576,8 @@ func (l *Loader) run() error {
 
 	// If we are restoring, we do not want to create a process.
 	// l.restore is set by the container manager when a restore call is made.
-	var ttyFile *host.TTYFileOperations
-	var ttyFileVFS2 *hostvfs2.TTYFileDescription
 	if !l.restore {
-		if l.conf.ProfileEnable {
+		if l.root.conf.ProfileEnable {
 			pprof.Initialize()
 		}
 
@@ -545,82 +587,30 @@ func (l *Loader) run() error {
 			return err
 		}
 
-		// Create the FD map, which will set stdin, stdout, and stderr.  If console
-		// is true, then ioctl calls will be passed through to the host fd.
-		ctx := l.rootProcArgs.NewContext(l.k)
-		var err error
-
-		// CreateProcess takes a reference on FDMap if successful. We won't need
-		// ours either way.
-		l.rootProcArgs.FDTable, ttyFile, ttyFileVFS2, err = createFDTable(ctx, l.console, l.stdioFDs)
-		if err != nil {
-			return fmt.Errorf("importing fds: %v", err)
-		}
-
-		// Setup the root container file system.
-		l.startGoferMonitor(l.sandboxID, l.goferFDs)
-
-		mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
-		if err := mntr.processHints(l.conf); err != nil {
-			return err
-		}
-		if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil {
-			return err
-		}
-
-		// Add the HOME enviroment variable if it is not already set.
-		var envv []string
-		if kernel.VFS2Enabled {
-			envv, err = user.MaybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2,
-				l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
-
-		} else {
-			envv, err = user.MaybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace,
-				l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
-		}
-		if err != nil {
-			return err
-		}
-		l.rootProcArgs.Envv = envv
-
 		// Create the root container init task. It will begin running
 		// when the kernel is started.
-		if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil {
-			return fmt.Errorf("creating init process: %v", err)
+		if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil {
+			return err
 		}
 
-		// CreateProcess takes a reference on FDTable if successful.
-		l.rootProcArgs.FDTable.DecRef()
 	}
 
 	ep.tg = l.k.GlobalInit()
-	if ns, ok := specutils.GetNS(specs.PIDNamespace, l.spec); ok {
+	if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok {
 		ep.pidnsPath = ns.Path
 	}
-	if l.console {
-		// Set the foreground process group on the TTY to the global init process
-		// group, since that is what we are about to start running.
-		switch {
-		case ttyFileVFS2 != nil:
-			ep.ttyVFS2 = ttyFileVFS2
-			ttyFileVFS2.InitForegroundProcessGroup(ep.tg.ProcessGroup())
-		case ttyFile != nil:
-			ep.tty = ttyFile
-			ttyFile.InitForegroundProcessGroup(ep.tg.ProcessGroup())
-		}
-	}
 
 	// Handle signals by forwarding them to the root container process
 	// (except for panic signal, which should cause a panic).
 	l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
 		// Panic signal should cause a panic.
-		if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) {
+		if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) {
 			panic("Signal-induced panic")
 		}
 
 		// Otherwise forward to root container.
 		deliveryMode := DeliverToProcess
-		if l.console {
+		if l.root.spec.Process.Terminal {
 			// Since we are running with a console, we should forward the signal to
 			// the foreground process group so that job control signals like ^C can
 			// be handled properly.
@@ -632,19 +622,6 @@ func (l *Loader) run() error {
 		}
 	})
 
-	// l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again
-	// either in createFDTable() during initial start or in descriptor.initAfterLoad()
-	// during restore, we can release l.stdioFDs now. VFS2 takes ownership of the
-	// passed FDs, so only close for VFS1.
-	if !kernel.VFS2Enabled {
-		for _, fd := range l.stdioFDs {
-			err := syscall.Close(fd)
-			if err != nil {
-				return fmt.Errorf("close dup()ed stdioFDs: %v", err)
-			}
-		}
-	}
-
 	log.Infof("Process should have started...")
 	l.watchdog.Start()
 	return l.k.Start()
@@ -664,9 +641,9 @@ func (l *Loader) createContainer(cid string) error {
 }
 
 // startContainer starts a child container. It returns the thread group ID of
-// the newly created process. Caller owns 'files' and may close them after
-// this method returns.
-func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
+// the newly created process. Used FDs are either closed or released. It's safe
+// for the caller to close any remaining files upon return.
+func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*fd.FD) error {
 	// Create capabilities.
 	caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
 	if err != nil {
@@ -676,8 +653,8 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
 	l.mu.Lock()
 	defer l.mu.Unlock()
 
-	eid := execID{cid: cid}
-	if _, ok := l.processes[eid]; !ok {
+	ep := l.processes[execID{cid: cid}]
+	if ep == nil {
 		return fmt.Errorf("trying to start a deleted container %q", cid)
 	}
 
@@ -711,73 +688,136 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
 		if pidns == nil {
 			pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace())
 		}
-		l.processes[eid].pidnsPath = ns.Path
+		ep.pidnsPath = ns.Path
 	} else {
 		pidns = l.k.RootPIDNamespace()
 	}
-	procArgs, err := newProcess(cid, spec, creds, l.k, pidns)
+
+	info := &containerInfo{
+		conf:     conf,
+		spec:     spec,
+		stdioFDs: files[:3],
+		goferFDs: files[3:],
+	}
+	info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns)
 	if err != nil {
 		return fmt.Errorf("creating new process: %v", err)
 	}
+	tg, err := l.createContainerProcess(false, cid, info, ep)
+	if err != nil {
+		return err
+	}
+
+	// Success!
+	l.k.StartProcess(tg)
+	ep.tg = tg
+	return nil
+}
 
-	// setupContainerFS() dups stdioFDs, so we don't need to dup them here.
-	var stdioFDs []int
-	for _, f := range files[:3] {
-		stdioFDs = append(stdioFDs, int(f.Fd()))
+func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo, ep *execProcess) (*kernel.ThreadGroup, error) {
+	console := false
+	if root {
+		// Only root container supports terminal for now.
+		console = info.spec.Process.Terminal
 	}
 
 	// Create the FD map, which will set stdin, stdout, and stderr.
-	ctx := procArgs.NewContext(l.k)
-	fdTable, _, _, err := createFDTable(ctx, false, stdioFDs)
+	ctx := info.procArgs.NewContext(l.k)
+	fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, console, info.stdioFDs)
 	if err != nil {
-		return fmt.Errorf("importing fds: %v", err)
+		return nil, fmt.Errorf("importing fds: %v", err)
 	}
-	// CreateProcess takes a reference on fdTable if successful. We won't
-	// need ours either way.
-	procArgs.FDTable = fdTable
+	// CreateProcess takes a reference on fdTable if successful. We won't need
+	// ours either way.
+	info.procArgs.FDTable = fdTable
 
-	// Can't take ownership away from os.File. dup them to get a new FDs.
-	var goferFDs []int
-	for _, f := range files[3:] {
-		fd, err := syscall.Dup(int(f.Fd()))
-		if err != nil {
-			return fmt.Errorf("failed to dup file: %v", err)
+	// Set up the container file system.
+	l.startGoferMonitor(cid, info.goferFDs)
+
+	mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints)
+	if root {
+		if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil {
+			return nil, err
 		}
-		goferFDs = append(goferFDs, fd)
+	}
+	if err := setupContainerFS(ctx, info.conf, mntr, &info.procArgs); err != nil {
+		return nil, err
 	}
 
-	// Setup the child container file system.
-	l.startGoferMonitor(cid, goferFDs)
+	// Add the HOME environment variable if it is not already set.
+	var envv []string
+	if kernel.VFS2Enabled {
+		envv, err = user.MaybeAddExecUserHomeVFS2(ctx, info.procArgs.MountNamespaceVFS2,
+			info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
 
-	mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints)
-	if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil {
-		return err
+	} else {
+		envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace,
+			info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
+	}
+	if err != nil {
+		return nil, err
 	}
+	info.procArgs.Envv = envv
 
 	// Create and start the new process.
-	tg, _, err := l.k.CreateProcess(procArgs)
+	tg, _, err := l.k.CreateProcess(info.procArgs)
 	if err != nil {
-		return fmt.Errorf("creating process: %v", err)
+		return nil, fmt.Errorf("creating process: %v", err)
 	}
-	l.k.StartProcess(tg)
-
 	// CreateProcess takes a reference on FDTable if successful.
-	procArgs.FDTable.DecRef()
+	info.procArgs.FDTable.DecRef(ctx)
 
-	l.processes[eid].tg = tg
-	return nil
+	// Set the foreground process group on the TTY to the global init process
+	// group, since that is what we are about to start running.
+	if root {
+		switch {
+		case ttyFileVFS2 != nil:
+			ep.ttyVFS2 = ttyFileVFS2
+			ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup())
+		case ttyFile != nil:
+			ep.tty = ttyFile
+			ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
+		}
+	}
+
+	// Install seccomp filters with the new task if there are any.
+	if info.conf.OCISeccomp {
+		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
+			program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp)
+			if err != nil {
+				return nil, fmt.Errorf("building seccomp program: %v", err)
+			}
+
+			if log.IsLogging(log.Debug) {
+				out, _ := bpf.DecodeProgram(program)
+				log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out)
+			}
+
+			task := tg.Leader()
+			// NOTE: It seems Flags are ignored by runc so we ignore them too.
+			if err := task.AppendSyscallFilter(program, true); err != nil {
+				return nil, fmt.Errorf("appending seccomp filters: %v", err)
+			}
+		}
+	} else {
+		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
+			log.Warningf("Seccomp spec is being ignored")
+		}
+	}
+
+	return tg, nil
 }
 
 // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
-// the gofer FDs looking for disconnects, and destroys the container if a
+// the gofer FDs looking for disconnects, and kills the container processes if a
 // disconnect occurs in any of the gofer FDs.
-func (l *Loader) startGoferMonitor(cid string, goferFDs []int) {
+func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) {
 	go func() {
 		log.Debugf("Monitoring gofer health for container %q", cid)
 		var events []unix.PollFd
-		for _, fd := range goferFDs {
+		for _, goferFD := range goferFDs {
 			events = append(events, unix.PollFd{
-				Fd:     int32(fd),
+				Fd:     int32(goferFD.FD()),
 				Events: unix.POLLHUP | unix.POLLRDHUP,
 			})
 		}
@@ -790,18 +830,15 @@ func (l *Loader) startGoferMonitor(cid string, goferFDs []int) {
 			panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err))
 		}
 
-		// Check if the gofer has stopped as part of normal container destruction.
-		// This is done just to avoid sending an annoying error message to the log.
-		// Note that there is a small race window in between mu.Unlock() and the
-		// lock being reacquired in destroyContainer(), but it's harmless to call
-		// destroyContainer() multiple times.
 		l.mu.Lock()
-		_, ok := l.processes[execID{cid: cid}]
-		l.mu.Unlock()
-		if ok {
-			log.Infof("Gofer socket disconnected, destroying container %q", cid)
-			if err := l.destroyContainer(cid); err != nil {
-				log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err)
+		defer l.mu.Unlock()
+
+		// The gofer could have been stopped due to a normal container shutdown.
+		// Check if the container has not stopped yet.
+		if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: cid}); tg != nil {
+			log.Infof("Gofer socket disconnected, killing container %q", cid)
+			if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
+				log.Warningf("Error killing container %q after gofer stopped: %v", cid, err)
 			}
 		}
 	}()
@@ -870,37 +907,41 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 		return 0, fmt.Errorf("container %q not started", args.ContainerID)
 	}
 
-	// Get the container MountNamespace from the Task.
+	// Get the container MountNamespace from the Task. Acquiring a ref may fail
+	// if it races with task exit.
 	if kernel.VFS2Enabled {
-		// task.MountNamespace() does not take a ref, so we must do so ourselves.
+		// task.MountNamespaceVFS2() does not take a ref, so we must do so ourselves.
 		args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2()
-		args.MountNamespaceVFS2.IncRef()
+		if !args.MountNamespaceVFS2.TryIncRef() {
+			return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
+		}
 	} else {
+		var reffed bool
 		tg.Leader().WithMuLocked(func(t *kernel.Task) {
 			// task.MountNamespace() does not take a ref, so we must do so ourselves.
 			args.MountNamespace = t.MountNamespace()
-			args.MountNamespace.IncRef()
+			reffed = args.MountNamespace.TryIncRef()
 		})
+		if !reffed {
+			return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
+		}
 	}
 
 	// Add the HOME environment variable if it is not already set.
 	if kernel.VFS2Enabled {
-		defer args.MountNamespaceVFS2.DecRef()
-
 		root := args.MountNamespaceVFS2.Root()
-		defer root.DecRef()
 		ctx := vfs.WithRoot(l.k.SupervisorContext(), root)
+		defer args.MountNamespaceVFS2.DecRef(ctx)
 		envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv)
 		if err != nil {
 			return 0, err
 		}
 		args.Envv = envv
 	} else {
-		defer args.MountNamespace.DecRef()
-
 		root := args.MountNamespace.Root()
-		defer root.DecRef()
 		ctx := fs.WithRoot(l.k.SupervisorContext(), root)
+		defer args.MountNamespace.DecRef(ctx)
+		defer root.DecRef(ctx)
 		envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
 		if err != nil {
 			return 0, err
@@ -997,20 +1038,25 @@ func (l *Loader) WaitExit() kernel.ExitStatus {
 	// Wait for container.
 	l.k.WaitExited()
 
+	// Cleanup
+	l.ctrl.stop()
+
+	refs.OnExit()
+
 	return l.k.GlobalInit().ExitStatus()
 }
 
-func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
+func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
 	// Create an empty network stack because the network namespace may be empty at
 	// this point. Netns is configured before Run() is called. Netstack is
 	// configured using a control uRPC message. Host network is configured inside
 	// Run().
 	switch conf.Network {
-	case NetworkHost:
+	case config.NetworkHost:
 		// No network namespacing support for hostinet yet, hence creator is nil.
 		return inet.NewRootNamespace(hostinet.NewStack(), nil), nil
 
-	case NetworkNone, NetworkSandbox:
+	case config.NetworkNone, config.NetworkSandbox:
 		s, err := newEmptySandboxNetworkStack(clock, uniqueID)
 		if err != nil {
 			return nil, err
@@ -1028,8 +1074,8 @@ func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.Uni
 }
 
 func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
-	netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
-	transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
+	netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol}
+	transProtos := []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol, icmp.NewProtocol4}
 	s := netstack.Stack{stack.New(stack.Options{
 		NetworkProtocols:   netProtos,
 		TransportProtocols: transProtos,
@@ -1043,21 +1089,32 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in
 	})}
 
 	// Enable SACK Recovery.
-	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
-		return nil, fmt.Errorf("failed to enable SACK: %v", err)
+	{
+		opt := tcpip.TCPSACKEnabled(true)
+		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+		}
 	}
 
 	// Set default TTLs as required by socket/netstack.
-	s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
-	s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+	{
+		opt := tcpip.DefaultTTLOption(netstack.DefaultTTL)
+		if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err)
+		}
+		if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err)
+		}
+	}
 
 	// Enable Receive Buffer Auto-Tuning.
-	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
-		return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
+	{
+		opt := tcpip.TCPModerateReceiveBufferOption(true)
+		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+		}
 	}
 
-	s.FillDefaultIPTables()
-
 	return &s, nil
 }
 
@@ -1251,7 +1308,7 @@ func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2
 	return ep.tty, ep.ttyVFS2, nil
 }
 
-func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
 	if len(stdioFDs) != 3 {
 		return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
 	}
@@ -1260,7 +1317,7 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F
 	fdTable := k.NewFDTable()
 	ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs)
 	if err != nil {
-		fdTable.DecRef()
+		fdTable.DecRef(ctx)
 		return nil, nil, nil, err
 	}
 	return fdTable, ttyFile, ttyFileVFS2, nil
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index e448fd773..b77b4762e 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -26,6 +26,7 @@ import (
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/control/server"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
@@ -34,6 +35,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/fsgofer"
 )
 
@@ -43,15 +45,19 @@ func init() {
 	if err := fsgofer.OpenProcSelfFD(); err != nil {
 		panic(err)
 	}
+	config.RegisterFlags()
 }
 
-func testConfig() *Config {
-	return &Config{
-		RootDir:        "unused_root_dir",
-		Network:        NetworkNone,
-		DisableSeccomp: true,
-		Platform:       "ptrace",
+func testConfig() *config.Config {
+	conf, err := config.NewFromFlags()
+	if err != nil {
+		panic(err)
 	}
+	// Change test defaults.
+	conf.RootDir = "unused_root_dir"
+	conf.Network = config.NetworkNone
+	conf.DisableSeccomp = true
+	return conf
 }
 
 // testSpec returns a simple spec that can be used in tests.
@@ -258,9 +264,9 @@ type CreateMountTestcase struct {
 	expectedPaths []string
 }
 
-func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
+func createMountTestcases() []*CreateMountTestcase {
 	testCases := []*CreateMountTestcase{
-		&CreateMountTestcase{
+		{
 			// Only proc.
 			name: "only proc mount",
 			spec: specs.Spec{
@@ -298,11 +304,10 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
 					},
 				},
 			},
-			// /some/deep/path should be mounted, along with /proc,
-			// /dev, and /sys.
+			// /some/very/very/deep/path should be mounted, along with /proc, /dev, and /sys.
 			expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"},
 		},
-		&CreateMountTestcase{
+		{
 			// Mounts are nested inside each other.
 			name: "nested mounts",
 			spec: specs.Spec{
@@ -346,7 +351,7 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
 			expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/qux",
 				"/foo/qux-quz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"},
 		},
-		&CreateMountTestcase{
+		{
 			name: "mount inside /dev",
 			spec: specs.Spec{
 				Root: &specs.Root{
@@ -389,46 +394,42 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
 			},
 			expectedPaths: []string{"/proc", "/dev", "/dev/fd-foo", "/dev/foo", "/dev/bar", "/sys"},
 		},
-	}
-
-	vfsCase := &CreateMountTestcase{
-		name: "mounts inside mandatory mounts",
-		spec: specs.Spec{
-			Root: &specs.Root{
-				Path:     os.TempDir(),
-				Readonly: true,
-			},
-			Mounts: []specs.Mount{
-				{
-					Destination: "/proc",
-					Type:        "tmpfs",
+		{
+			name: "mounts inside mandatory mounts",
+			spec: specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
 				},
-				// TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports
-				//  MkDirAt in VFS2 (and remove the reduntant append).
-				// {
-				//		Destination: "/sys/bar",
-				//		Type:        "tmpfs",
-				//	},
-				//
-				{
-					Destination: "/tmp/baz",
-					Type:        "tmpfs",
+				Mounts: []specs.Mount{
+					{
+						Destination: "/proc",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/sys/bar",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/tmp/baz",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/dev/goo",
+						Type:        "tmpfs",
+					},
 				},
 			},
+			expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz", "/dev/goo"},
 		},
-		expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"},
 	}
 
-	if !vfs2 {
-		vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"})
-		vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar")
-	}
-	return append(testCases, vfsCase)
+	return testCases
 }
 
 // Test that MountNamespace can be created with various specs.
 func TestCreateMountNamespace(t *testing.T) {
-	for _, tc := range createMountTestcases(false /* vfs2 */) {
+	for _, tc := range createMountTestcases() {
 		t.Run(tc.name, func(t *testing.T) {
 			conf := testConfig()
 			ctx := contexttest.Context(t)
@@ -439,7 +440,7 @@ func TestCreateMountNamespace(t *testing.T) {
 			}
 			defer cleanup()
 
-			mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{})
+			mntr := newContainerMounter(&tc.spec, []*fd.FD{fd.New(sandEnd)}, nil, &podMountHints{})
 			mns, err := mntr.createMountNamespace(ctx, conf)
 			if err != nil {
 				t.Fatalf("failed to create mount namespace: %v", err)
@@ -450,13 +451,13 @@ func TestCreateMountNamespace(t *testing.T) {
 			}
 
 			root := mns.Root()
-			defer root.DecRef()
+			defer root.DecRef(ctx)
 			for _, p := range tc.expectedPaths {
 				maxTraversals := uint(0)
 				if d, err := mns.FindInode(ctx, root, root, p, &maxTraversals); err != nil {
 					t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
 				} else {
-					d.DecRef()
+					d.DecRef(ctx)
 				}
 			}
 		})
@@ -465,7 +466,7 @@ func TestCreateMountNamespace(t *testing.T) {
 
 // Test that MountNamespace can be created with various specs.
 func TestCreateMountNamespaceVFS2(t *testing.T) {
-	for _, tc := range createMountTestcases(true /* vfs2 */) {
+	for _, tc := range createMountTestcases() {
 		t.Run(tc.name, func(t *testing.T) {
 			spec := testSpec()
 			spec.Mounts = tc.spec.Mounts
@@ -479,19 +480,20 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
 			defer l.Destroy()
 			defer loaderCleanup()
 
-			mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
-			if err := mntr.processHints(l.conf); err != nil {
+			mntr := newContainerMounter(l.root.spec, l.root.goferFDs, l.k, l.mountHints)
+			if err := mntr.processHints(l.root.conf, l.root.procArgs.Credentials); err != nil {
 				t.Fatalf("failed process hints: %v", err)
 			}
 
 			ctx := l.k.SupervisorContext()
-			mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs)
+			mns, err := mntr.mountAll(l.root.conf, &l.root.procArgs)
 			if err != nil {
-				t.Fatalf("failed to setupVFS2: %v", err)
+				t.Fatalf("mountAll: %v", err)
 			}
 
 			root := mns.Root()
-			defer root.DecRef()
+			root.IncRef()
+			defer root.DecRef(ctx)
 			for _, p := range tc.expectedPaths {
 				target := &vfs.PathOperation{
 					Root:  root,
@@ -499,10 +501,10 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
 					Path:  fspath.Parse(p),
 				}
 
-				if d, err := l.k.VFS().GetDentryAt(ctx, l.rootProcArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil {
+				if d, err := l.k.VFS().GetDentryAt(ctx, l.root.procArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil {
 					t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
 				} else {
-					d.DecRef()
+					d.DecRef(ctx)
 				}
 			}
 		})
@@ -545,7 +547,7 @@ func TestRestoreEnvironment(t *testing.T) {
 						{
 							Dev:        "9pfs-/",
 							Flags:      fs.MountSourceFlags{ReadOnly: true},
-							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
 						},
 					},
 					"tmpfs": {
@@ -599,7 +601,7 @@ func TestRestoreEnvironment(t *testing.T) {
 						{
 							Dev:        "9pfs-/",
 							Flags:      fs.MountSourceFlags{ReadOnly: true},
-							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
 						},
 						{
 							Dev:        "9pfs-/dev/fd-foo",
@@ -657,7 +659,7 @@ func TestRestoreEnvironment(t *testing.T) {
 						{
 							Dev:        "9pfs-/",
 							Flags:      fs.MountSourceFlags{ReadOnly: true},
-							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
 						},
 					},
 					"tmpfs": {
@@ -697,7 +699,11 @@ func TestRestoreEnvironment(t *testing.T) {
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
 			conf := testConfig()
-			mntr := newContainerMounter(tc.spec, tc.ioFDs, nil, &podMountHints{})
+			var ioFDs []*fd.FD
+			for _, ioFD := range tc.ioFDs {
+				ioFDs = append(ioFDs, fd.New(ioFD))
+			}
+			mntr := newContainerMounter(tc.spec, ioFDs, nil, &podMountHints{})
 			actualRenv, err := mntr.createRestoreEnvironment(conf)
 			if !tc.errorExpected && err != nil {
 				t.Fatalf("could not create restore environment for test:%s", tc.name)
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 0af30456e..988573640 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
 	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.dev/gvisor/pkg/tcpip/link/packetsocket"
 	"gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo"
 	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
 	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
@@ -32,6 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 var (
@@ -77,44 +79,6 @@ type DefaultRoute struct {
 	Name  string
 }
 
-// QueueingDiscipline is used to specify the kind of Queueing Discipline to
-// apply for a give FDBasedLink.
-type QueueingDiscipline int
-
-const (
-	// QDiscNone disables any queueing for the underlying FD.
-	QDiscNone QueueingDiscipline = iota
-
-	// QDiscFIFO applies a simple fifo based queue to the underlying
-	// FD.
-	QDiscFIFO
-)
-
-// MakeQueueingDiscipline if possible the equivalent QueuingDiscipline for s
-// else returns an error.
-func MakeQueueingDiscipline(s string) (QueueingDiscipline, error) {
-	switch s {
-	case "none":
-		return QDiscNone, nil
-	case "fifo":
-		return QDiscFIFO, nil
-	default:
-		return 0, fmt.Errorf("unsupported qdisc specified: %q", s)
-	}
-}
-
-// String implements fmt.Stringer.
-func (q QueueingDiscipline) String() string {
-	switch q {
-	case QDiscNone:
-		return "none"
-	case QDiscFIFO:
-		return "fifo"
-	default:
-		panic(fmt.Sprintf("Invalid queueing discipline: %d", q))
-	}
-}
-
 // FDBasedLink configures an fd-based link.
 type FDBasedLink struct {
 	Name               string
@@ -123,8 +87,10 @@ type FDBasedLink struct {
 	Routes             []Route
 	GSOMaxSize         uint32
 	SoftwareGSOEnabled bool
+	TXChecksumOffload  bool
+	RXChecksumOffload  bool
 	LinkAddress        net.HardwareAddr
-	QDisc              QueueingDiscipline
+	QDisc              config.QueueingDiscipline
 
 	// NumChannels controls how many underlying FD's are to be used to
 	// create this endpoint.
@@ -236,19 +202,23 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 			PacketDispatchMode: fdbased.RecvMMsg,
 			GSOMaxSize:         link.GSOMaxSize,
 			SoftwareGSOEnabled: link.SoftwareGSOEnabled,
-			RXChecksumOffload:  true,
+			TXChecksumOffload:  link.TXChecksumOffload,
+			RXChecksumOffload:  link.RXChecksumOffload,
 		})
 		if err != nil {
 			return err
 		}
 
 		switch link.QDisc {
-		case QDiscNone:
-		case QDiscFIFO:
+		case config.QDiscNone:
+		case config.QDiscFIFO:
 			log.Infof("Enabling FIFO QDisc on %q", link.Name)
 			linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000)
 		}
 
+		// Enable support for AF_PACKET sockets to receive outgoing packets.
+		linkEP = packetsocket.New(linkEP)
+
 		log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
 		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
 			return err
diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go
index fbfd3b07c..c21648a32 100644
--- a/runsc/boot/strace.go
+++ b/runsc/boot/strace.go
@@ -15,10 +15,13 @@
 package boot
 
 import (
+	"strings"
+
 	"gvisor.dev/gvisor/pkg/sentry/strace"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
-func enableStrace(conf *Config) error {
+func enableStrace(conf *config.Config) error {
 	// We must initialize even if strace is not enabled.
 	strace.Initialize()
 
@@ -36,5 +39,5 @@ func enableStrace(conf *Config) error {
 		strace.EnableAll(strace.SinkTypeLog)
 		return nil
 	}
-	return strace.Enable(conf.StraceSyscalls, strace.SinkTypeLog)
+	return strace.Enable(strings.Split(conf.StraceSyscalls, ","), strace.SinkTypeLog)
 }
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index 6c84f0794..b157387ef 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -16,30 +16,40 @@ package boot
 
 import (
 	"fmt"
-	"path"
 	"sort"
 	"strings"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/cleanup"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/devices/memdev"
+	"gvisor.dev/gvisor/pkg/sentry/devices/ttydev"
+	"gvisor.dev/gvisor/pkg/sentry/devices/tundev"
 	"gvisor.dev/gvisor/pkg/sentry/fs/user"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/fuse"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/overlay"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
-func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error {
+func registerFilesystems(k *kernel.Kernel) error {
+	ctx := k.SupervisorContext()
+	creds := auth.NewRootCredentials(k.RootUserNamespace())
+	vfsObj := k.VFS()
+
 	vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserList: true,
 		// TODO(b/29356795): Users may mount this once the terminals are in a
@@ -53,6 +63,10 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre
 	vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserList: true,
 	})
+	vfsObj.MustRegisterFilesystemType(overlay.Name, &overlay.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
 	vfsObj.MustRegisterFilesystemType(proc.Name, &proc.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 		AllowUserList:  true,
@@ -65,46 +79,78 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre
 		AllowUserMount: true,
 		AllowUserList:  true,
 	})
+	vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
 
 	// Setup files in devtmpfs.
 	if err := memdev.Register(vfsObj); err != nil {
 		return fmt.Errorf("registering memdev: %w", err)
 	}
+	if err := ttydev.Register(vfsObj); err != nil {
+		return fmt.Errorf("registering ttydev: %w", err)
+	}
+	tunSupported := tundev.IsNetTunSupported(inet.StackFromContext(ctx))
+	if tunSupported {
+		if err := tundev.Register(vfsObj); err != nil {
+			return fmt.Errorf("registering tundev: %v", err)
+		}
+	}
+
+	if kernel.FUSEEnabled {
+		if err := fuse.Register(vfsObj); err != nil {
+			return fmt.Errorf("registering fusedev: %w", err)
+		}
+	}
+
 	a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name)
 	if err != nil {
 		return fmt.Errorf("creating devtmpfs accessor: %w", err)
 	}
-	defer a.Release()
+	defer a.Release(ctx)
 
 	if err := a.UserspaceInit(ctx); err != nil {
 		return fmt.Errorf("initializing userspace: %w", err)
 	}
 	if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil {
-		return fmt.Errorf("creating devtmpfs files: %w", err)
+		return fmt.Errorf("creating memdev devtmpfs files: %w", err)
+	}
+	if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil {
+		return fmt.Errorf("creating ttydev devtmpfs files: %w", err)
+	}
+	if tunSupported {
+		if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil {
+			return fmt.Errorf("creating tundev devtmpfs files: %v", err)
+		}
 	}
+
+	if kernel.FUSEEnabled {
+		if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil {
+			return fmt.Errorf("creating fusedev devtmpfs files: %w", err)
+		}
+	}
+
 	return nil
 }
 
-func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
-	if err := mntr.k.VFS().Init(); err != nil {
-		return fmt.Errorf("failed to initialize VFS: %w", err)
-	}
-	mns, err := mntr.setupVFS2(ctx, conf, procArgs)
+func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+	mns, err := mntr.mountAll(conf, procArgs)
 	if err != nil {
 		return fmt.Errorf("failed to setupFS: %w", err)
 	}
 	procArgs.MountNamespaceVFS2 = mns
 
 	// Resolve the executable path from working dir and environment.
-	f, err := user.ResolveExecutablePathVFS2(ctx, procArgs.Credentials, procArgs.MountNamespaceVFS2, procArgs.Envv, procArgs.WorkingDirectory, procArgs.Argv[0])
+	resolved, err := user.ResolveExecutablePath(ctx, procArgs)
 	if err != nil {
-		return fmt.Errorf("searching for executable %q, cwd: %q, envv: %q: %v", procArgs.Argv[0], procArgs.WorkingDirectory, procArgs.Envv, err)
+		return err
 	}
-	procArgs.Filename = f
+	procArgs.Filename = resolved
 	return nil
 }
 
-func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
+func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
 	log.Infof("Configuring container's file system with VFS2")
 
 	// Create context with root credentials to mount the filesystem (the current
@@ -117,36 +163,147 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs
 	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
 	rootCtx := procArgs.NewContext(c.k)
 
-	if err := registerFilesystems(rootCtx, c.k.VFS(), rootCreds); err != nil {
-		return nil, fmt.Errorf("register filesystems: %w", err)
-	}
-
 	mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds)
 	if err != nil {
 		return nil, fmt.Errorf("creating mount namespace: %w", err)
 	}
 	rootProcArgs.MountNamespaceVFS2 = mns
 
+	root := mns.Root()
+	root.IncRef()
+	defer root.DecRef(rootCtx)
+	if root.Mount().ReadOnly() {
+		// Switch to ReadWrite while we setup submounts.
+		if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil {
+			return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err)
+		}
+		// Restore back to ReadOnly at the end.
+		defer func() {
+			if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil {
+				panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err))
+			}
+		}()
+	}
+
 	// Mount submounts.
 	if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil {
 		return nil, fmt.Errorf("mounting submounts vfs2: %w", err)
 	}
+
 	return mns, nil
 }
 
-func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
+// createMountNamespaceVFS2 creates the container's root mount and namespace.
+func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
 	fd := c.fds.remove()
-	opts := strings.Join(p9MountData(fd, conf.FileAccess, true /* vfs2 */), ",")
+	data := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
+
+	if conf.OverlayfsStaleRead {
+		// We can't check for overlayfs here because sandbox is chroot'ed and gofer
+		// can only send mount options for specs.Mounts (specs.Root is missing
+		// Options field). So assume root is always on top of overlayfs.
+		data = append(data, "overlayfs_stale_read")
+	}
 
 	log.Infof("Mounting root over 9P, ioFD: %d", fd)
-	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{Data: opts})
+	opts := &vfs.MountOptions{
+		ReadOnly: c.root.Readonly,
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			Data: strings.Join(data, ","),
+			InternalData: gofer.InternalFilesystemOptions{
+				UniqueID: "/",
+			},
+		},
+		InternalMount: true,
+	}
+
+	fsName := gofer.Name
+	if conf.Overlay && !c.root.Readonly {
+		log.Infof("Adding overlay on top of root")
+		var err error
+		var cleanup func()
+		opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+		if err != nil {
+			return nil, fmt.Errorf("mounting root with overlay: %w", err)
+		}
+		defer cleanup()
+		fsName = overlay.Name
+	}
+
+	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", fsName, opts)
 	if err != nil {
 		return nil, fmt.Errorf("setting up mount namespace: %w", err)
 	}
 	return mns, nil
 }
 
-func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
+// configureOverlay mounts the lower layer using "lowerOpts" (forced to be
+// read-only), mounts the upper layer using tmpfs, and returns the overlay mount
+// options. "cleanup" must be called after the options have been used to mount
+// the overlay, to release refs on lower and upper mounts.
+func (c *containerMounter) configureOverlay(ctx context.Context, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string) (*vfs.MountOptions, func(), error) {
+	// First copy options from lower layer to upper layer and overlay. Clear
+	// filesystem specific options.
+	upperOpts := *lowerOpts
+	upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
+
+	overlayOpts := *lowerOpts
+	overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
+
+	// Next mount upper and lower. Upper is a tmpfs mount to keep all
+	// modifications inside the sandbox.
+	upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %w", upperOpts, err)
+	}
+	cu := cleanup.Make(func() { upper.DecRef(ctx) })
+	defer cu.Clean()
+
+	// All writes go to the upper layer, be paranoid and make lower readonly.
+	lowerOpts.ReadOnly = true
+	lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create lower layer for overlay: %w", err)
+	}
+	cu.Add(func() { lower.DecRef(ctx) })
+
+	// Propagate the lower layer's root's owner, group, and mode to the upper
+	// layer's root for consistency with VFS1.
+	upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root())
+	lowerRootVD := vfs.MakeVirtualDentry(lower, lower.Root())
+	stat, err := c.k.VFS().StatAt(ctx, creds, &vfs.PathOperation{
+		Root:  lowerRootVD,
+		Start: lowerRootVD,
+	}, &vfs.StatOptions{
+		Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE,
+	})
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to stat root of overlay lower layer: %w", err)
+	}
+	err = c.k.VFS().SetStatAt(ctx, creds, &vfs.PathOperation{
+		Root:  upperRootVD,
+		Start: upperRootVD,
+	}, &vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: (linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE) & stat.Mask,
+			UID:  stat.UID,
+			GID:  stat.GID,
+			Mode: stat.Mode,
+		},
+	})
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to set owner and mode on root of overlay upper layer: %w", err)
+	}
+
+	// Configure overlay with both layers.
+	overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{
+		UpperRoot:  upperRootVD,
+		LowerRoots: []vfs.VirtualDentry{lowerRootVD},
+	}
+	return &overlayOpts, cu.Release(), nil
+}
+
+func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
 	mounts, err := c.prepareMountsVFS2()
 	if err != nil {
 		return err
@@ -155,8 +312,34 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config,
 	for i := range mounts {
 		submount := &mounts[i]
 		log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options)
-		if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil {
-			return err
+		var (
+			mnt *vfs.Mount
+			err error
+		)
+
+		if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() {
+			mnt, err = c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint)
+			if err != nil {
+				return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err)
+			}
+		} else {
+			mnt, err = c.mountSubmountVFS2(ctx, conf, mns, creds, submount)
+			if err != nil {
+				return fmt.Errorf("mount submount %q: %w", submount.Destination, err)
+			}
+		}
+
+		if mnt != nil && mnt.ReadOnly() {
+			// Switch to ReadWrite while we setup submounts.
+			if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
+				return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.Destination, err)
+			}
+			// Restore back to ReadOnly at the end.
+			defer func() {
+				if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
+					panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.Destination, err))
+				}
+			}()
 		}
 	}
 
@@ -200,67 +383,94 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
 	return mounts, nil
 }
 
-func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error {
-	root := mns.Root()
-	defer root.DecRef()
-	target := &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(submount.Destination),
-	}
-	fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount)
+func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) (*vfs.Mount, error) {
+	fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, submount)
 	if err != nil {
-		return fmt.Errorf("mountOptions failed: %w", err)
+		return nil, fmt.Errorf("mountOptions failed: %w", err)
 	}
 	if len(fsName) == 0 {
 		// Filesystem is not supported (e.g. cgroup), just skip it.
-		return nil
+		return nil, nil
 	}
 
-	if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil {
-		return err
+	if err := c.makeMountPoint(ctx, creds, mns, submount.Destination); err != nil {
+		return nil, fmt.Errorf("creating mount point %q: %w", submount.Destination, err)
 	}
-	if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil {
-		return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
+
+	if useOverlay {
+		log.Infof("Adding overlay on top of mount %q", submount.Destination)
+		var cleanup func()
+		opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+		if err != nil {
+			return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.Destination, err)
+		}
+		defer cleanup()
+		fsName = overlay.Name
+	}
+
+	root := mns.Root()
+	root.IncRef()
+	defer root.DecRef(ctx)
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(submount.Destination),
+	}
+	mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts)
+	if err != nil {
+		return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
 	}
 	log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts.GetFilesystemOptions.Data)
-	return nil
+	return mnt, nil
 }
 
 // getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
 // used for mounts.
-func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) {
-	var (
-		fsName string
-		data   []string
-	)
+func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, bool, error) {
+	fsName := m.Type
+	useOverlay := false
+	var data []string
+	var iopts interface{}
 
 	// Find filesystem name and FS specific data field.
 	switch m.Type {
 	case devpts.Name, devtmpfs.Name, proc.Name, sys.Name:
-		fsName = m.Type
+		// Nothing to do.
+
 	case nonefs:
 		fsName = sys.Name
-	case tmpfs.Name:
-		fsName = m.Type
 
+	case tmpfs.Name:
 		var err error
 		data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
 		if err != nil {
-			return "", nil, err
+			return "", nil, false, err
 		}
 
 	case bind:
 		fsName = gofer.Name
+		if m.fd == 0 {
+			// Check that an FD was provided to fail fast. Technically FD=0 is valid,
+			// but unlikely to be correct in this context.
+			return "", nil, false, fmt.Errorf("9P mount requires a connection FD")
+		}
 		data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)
+		iopts = gofer.InternalFilesystemOptions{
+			UniqueID: m.Destination,
+		}
+
+		// If configured, add overlay to all writable mounts.
+		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
 
 	default:
 		log.Warningf("ignoring unknown filesystem type %q", m.Type)
+		return "", nil, false, nil
 	}
 
 	opts := &vfs.MountOptions{
 		GetFilesystemOptions: vfs.GetFilesystemOptions{
-			Data: strings.Join(data, ","),
+			Data:         strings.Join(data, ","),
+			InternalData: iopts,
 		},
 		InternalMount: true,
 	}
@@ -272,7 +482,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF
 		case "ro":
 			opts.ReadOnly = true
 		case "noatime":
-			// TODO(gvisor.dev/issue/1193): Implement MS_NOATIME.
+			opts.Flags.NoATime = true
 		case "noexec":
 			opts.Flags.NoExec = true
 		default:
@@ -280,38 +490,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF
 		}
 	}
 
-	if conf.Overlay {
-		// All writes go to upper, be paranoid and make lower readonly.
-		opts.ReadOnly = true
-	}
-	return fsName, opts, nil
-}
-
-func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error {
-	target := &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(currentPath),
-	}
-	_, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{})
-	if err == nil {
-		// Mount point exists, nothing else to do.
-		return nil
-	}
-	if err != syserror.ENOENT {
-		return fmt.Errorf("stat failed for %q during mount point creation: %w", currentPath, err)
-	}
-
-	// Recurse to ensure parent is created and then create the mount point.
-	if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil {
-		return err
-	}
-	log.Debugf("Creating dir %q for mount point", currentPath)
-	mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
-	if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil {
-		return fmt.Errorf("failed to create directory %q for mount: %w", currentPath, err)
-	}
-	return nil
+	return fsName, opts, useOverlay, nil
 }
 
 // mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so.
@@ -323,7 +502,7 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s
 //
 // Note that when there are submounts inside of '/tmp', directories for the
 // mount points must be present, making '/tmp' not empty anymore.
-func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
+func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
 	for _, m := range c.mounts {
 		// m.Destination has been cleaned, so it's to use equality here.
 		if m.Destination == "/tmp" {
@@ -333,28 +512,36 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds
 	}
 
 	root := mns.Root()
-	defer root.DecRef()
+	root.IncRef()
+	defer root.DecRef(ctx)
 	pop := vfs.PathOperation{
 		Root:  root,
 		Start: root,
 		Path:  fspath.Parse("/tmp"),
 	}
 	// TODO(gvisor.dev/issue/2782): Use O_PATH when available.
-	statx, err := c.k.VFS().StatAt(ctx, creds, &pop, &vfs.StatOptions{})
+	fd, err := c.k.VFS().OpenAt(ctx, creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY})
 	switch err {
 	case nil:
-		// Found '/tmp' in filesystem, check if it's empty.
-		if linux.FileMode(statx.Mode).FileType() != linux.ModeDirectory {
-			// Not a dir?! Leave it be.
+		defer fd.DecRef(ctx)
+
+		err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
+			if dirent.Name != "." && dirent.Name != ".." {
+				return syserror.ENOTEMPTY
+			}
 			return nil
-		}
-		if statx.Nlink > 2 {
+		}))
+		switch err {
+		case nil:
+			log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
+		case syserror.ENOTEMPTY:
 			// If more than "." and ".." is found, skip internal tmpfs to prevent
 			// hiding existing files.
 			log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`)
 			return nil
+		default:
+			return err
 		}
-		log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
 		fallthrough
 
 	case syserror.ENOENT:
@@ -367,9 +554,142 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds
 			// another user. This is normally done for /tmp.
 			Options: []string{"mode=01777"},
 		}
-		return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
+		_, err := c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
+		return err
+
+	case syserror.ENOTDIR:
+		// Not a dir?! Let it be.
+		return nil
 
 	default:
-		return fmt.Errorf(`stating "/tmp" inside container: %w`, err)
+		return fmt.Errorf(`opening "/tmp" inside container: %w`, err)
+	}
+}
+
+// processHintsVFS2 processes annotations that contain hints about how volumes
+// should be mounted (e.g. a volume shared between containers). It must be
+// called for the root container only.
+func (c *containerMounter) processHintsVFS2(conf *config.Config, creds *auth.Credentials) error {
+	ctx := c.k.SupervisorContext()
+	for _, hint := range c.hints.mounts {
+		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
+		// common gofer to mount all shared volumes.
+		if hint.mount.Type != tmpfs.Name {
+			continue
+		}
+
+		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
+		mnt, err := c.mountSharedMasterVFS2(ctx, conf, hint, creds)
+		if err != nil {
+			return fmt.Errorf("mounting shared master %q: %w", hint.name, err)
+		}
+		hint.vfsMount = mnt
+	}
+	return nil
+}
+
+// mountSharedMasterVFS2 mounts the master of a volume that is shared among
+// containers in a pod. It fails if hint's mount type maps to no filesystem.
+func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *config.Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) {
+	// Map mount type to filesystem name, and parse out the options that we are
+	// capable of dealing with.
+	mntFD := &mountAndFD{Mount: hint.mount}
+	fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, mntFD)
+	if err != nil {
+		return nil, err
+	}
+	if len(fsName) == 0 {
+		return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
+	}
+
+	if useOverlay {
+		log.Infof("Adding overlay on top of shared mount %q", mntFD.Destination)
+		var cleanup func()
+		opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+		if err != nil {
+			return nil, fmt.Errorf("mounting shared volume with overlay at %q: %w", mntFD.Destination, err)
+		}
+		defer cleanup()
+		fsName = overlay.Name
+	}
+
+	return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
+}
+
+// mountSharedSubmountVFS2 bind mounts a previously mounted volume that is
+// shared among containers in the same pod at mount.Destination.
+func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) (*vfs.Mount, error) {
+	if err := source.checkCompatible(mount); err != nil {
+		return nil, err
+	}
+
+	// Ignore data and useOverlay because these were already applied to
+	// the master mount.
+	_, opts, _, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount})
+	if err != nil {
+		return nil, err
+	}
+	newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts)
+	if err != nil {
+		return nil, err
+	}
+	defer newMnt.DecRef(ctx)
+
+	root := mns.Root()
+	root.IncRef()
+	defer root.DecRef(ctx)
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(mount.Destination),
+	}
+
+	if err := c.makeMountPoint(ctx, creds, mns, mount.Destination); err != nil {
+		return nil, fmt.Errorf("creating mount point %q: %w", mount.Destination, err)
+	}
+
+	if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
+		return nil, err
+	}
+	log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
+	return newMnt, nil
+}
+
+func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error {
+	root := mns.Root()
+	root.IncRef()
+	defer root.DecRef(ctx)
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(dest),
+	}
+	// First check if mount point exists. When overlay is enabled, gofer doesn't
+	// allow changes to the FS, making MakeSyntheticMountpoint() ineffective
+	// because MkdirAt fails with EROFS even if file exists.
+	vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{})
+	if err == nil {
+		// File exists, we're done.
+		vd.DecRef(ctx)
+		return nil
+	}
+	return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds)
+}
+
+// configureRestore returns a context derived from ctx that carries, under
+// gofer.CtxRestoreServerFDMap, a map from mount destination to FD used by restore.
+func (c *containerMounter) configureRestore(ctx context.Context, conf *config.Config) (context.Context, error) {
+	fdmap := make(map[string]int)
+	fdmap["/"] = c.fds.remove()
+	mounts, err := c.prepareMountsVFS2()
+	if err != nil {
+		return ctx, err
+	}
+	for i := range mounts {
+		submount := &mounts[i]
+		if submount.fd >= 0 {
+			fdmap[submount.Destination] = submount.fd
+		}
 	}
+	return context.WithValue(ctx, gofer.CtxRestoreServerFDMap, fdmap), nil
 }
diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD
index c087e1a3c..37f4253ba 100644
--- a/runsc/cgroup/BUILD
+++ b/runsc/cgroup/BUILD
@@ -10,7 +10,7 @@ go_library(
         "//pkg/cleanup",
         "//pkg/log",
         "@com_github_cenkalti_backoff//:go_default_library",
-        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
     ],
 )
 
@@ -20,4 +20,8 @@ go_test(
     srcs = ["cgroup_test.go"],
     library = ":cgroup",
     tags = ["local"],
+    deps = [
+        "//pkg/test/testutil",
+        "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
+    ],
 )
diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go
index ef01820ef..5bd0afc52 100644
--- a/runsc/cgroup/cgroup.go
+++ b/runsc/cgroup/cgroup.go
@@ -21,6 +21,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"io"
 	"io/ioutil"
 	"os"
 	"path/filepath"
@@ -43,6 +44,7 @@ var controllers = map[string]config{
 	"blkio":    config{ctrlr: &blockIO{}},
 	"cpu":      config{ctrlr: &cpu{}},
 	"cpuset":   config{ctrlr: &cpuSet{}},
+	"hugetlb":  config{ctrlr: &hugeTLB{}, optional: true},
 	"memory":   config{ctrlr: &memory{}},
 	"net_cls":  config{ctrlr: &networkClass{}},
 	"net_prio": config{ctrlr: &networkPrio{}},
@@ -52,7 +54,6 @@ var controllers = map[string]config{
 	// irrelevant for a sandbox.
 	"devices":    config{ctrlr: &noop{}},
 	"freezer":    config{ctrlr: &noop{}},
-	"hugetlb":    config{ctrlr: &noop{}, optional: true},
 	"perf_event": config{ctrlr: &noop{}},
 	"rdma":       config{ctrlr: &noop{}, optional: true},
 	"systemd":    config{ctrlr: &noop{}},
@@ -92,7 +93,17 @@ func setOptionalValueUint16(path, name string, val *uint16) error {
 
 func setValue(path, name, data string) error {
 	fullpath := filepath.Join(path, name)
-	return ioutil.WriteFile(fullpath, []byte(data), 0700)
+
+	// Retry writes on EINTR; see:
+	//    https://github.com/golang/go/issues/38033
+	for {
+		err := ioutil.WriteFile(fullpath, []byte(data), 0700)
+		if err == nil {
+			return nil
+		} else if !errors.Is(err, syscall.EINTR) {
+			return err
+		}
+	}
 }
 
 func getValue(path, name string) (string, error) {
@@ -125,15 +136,23 @@ func fillFromAncestor(path string) (string, error) {
 		return val, nil
 	}
 
-	// File is not set, recurse to parent and then  set here.
+	// File is not set, recurse to parent and then set here.
 	name := filepath.Base(path)
 	parent := filepath.Dir(filepath.Dir(path))
 	val, err = fillFromAncestor(filepath.Join(parent, name))
 	if err != nil {
 		return "", err
 	}
-	if err := ioutil.WriteFile(path, []byte(val), 0700); err != nil {
-		return "", err
+
+	// Retry writes on EINTR; see:
+	//    https://github.com/golang/go/issues/38033
+	for {
+		err := ioutil.WriteFile(path, []byte(val), 0700)
+		if err == nil {
+			break
+		} else if !errors.Is(err, syscall.EINTR) {
+			return "", err
+		}
 	}
 	return val, nil
 }
@@ -180,16 +199,26 @@ func LoadPaths(pid string) (map[string]string, error) {
 	}
 	defer f.Close()
 
+	return loadPathsHelper(f)
+}
+
+func loadPathsHelper(cgroup io.Reader) (map[string]string, error) {
 	paths := make(map[string]string)
-	scanner := bufio.NewScanner(f)
+
+	scanner := bufio.NewScanner(cgroup)
 	for scanner.Scan() {
-		// Format: ID:controller1,controller2:path
+		// Format: ID:[name=]controller1,controller2:path
 		// Example: 2:cpu,cpuacct:/user.slice
 		tokens := strings.Split(scanner.Text(), ":")
 		if len(tokens) != 3 {
 			return nil, fmt.Errorf("invalid cgroups file, line: %q", scanner.Text())
 		}
+		if len(tokens[1]) == 0 {
+			continue
+		}
 		for _, ctrlr := range strings.Split(tokens[1], ",") {
+			// Remove prefix for cgroups with no controller, eg. systemd.
+			ctrlr = strings.TrimPrefix(ctrlr, "name=")
 			paths[ctrlr] = tokens[2]
 		}
 	}
@@ -219,7 +248,7 @@ func New(spec *specs.Spec) (*Cgroup, error) {
 		var err error
 		parents, err = LoadPaths("self")
 		if err != nil {
-			return nil, fmt.Errorf("finding current cgroups: %v", err)
+			return nil, fmt.Errorf("finding current cgroups: %w", err)
 		}
 	}
 	return &Cgroup{
@@ -258,10 +287,8 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error {
 			}
 			return err
 		}
-		if res != nil {
-			if err := cfg.ctrlr.set(res, path); err != nil {
-				return err
-			}
+		if err := cfg.ctrlr.set(res, path); err != nil {
+			return err
 		}
 	}
 	clean.Release()
@@ -286,14 +313,15 @@ func (c *Cgroup) Uninstall() error {
 		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 		defer cancel()
 		b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
-		if err := backoff.Retry(func() error {
+		fn := func() error {
 			err := syscall.Rmdir(path)
 			if os.IsNotExist(err) {
 				return nil
 			}
 			return err
-		}, b); err != nil {
-			return fmt.Errorf("removing cgroup path %q: %v", path, err)
+		}
+		if err := backoff.Retry(fn, b); err != nil {
+			return fmt.Errorf("removing cgroup path %q: %w", path, err)
 		}
 	}
 	return nil
@@ -314,7 +342,6 @@ func (c *Cgroup) Join() (func(), error) {
 		if _, ok := controllers[ctrlr]; ok {
 			fullPath := filepath.Join(cgroupRoot, ctrlr, path)
 			undoPaths = append(undoPaths, fullPath)
-			break
 		}
 	}
 
@@ -404,7 +431,7 @@ func (*noop) set(*specs.LinuxResources, string) error {
 type memory struct{}
 
 func (*memory) set(spec *specs.LinuxResources, path string) error {
-	if spec.Memory == nil {
+	if spec == nil || spec.Memory == nil {
 		return nil
 	}
 	if err := setOptionalValueInt(path, "memory.limit_in_bytes", spec.Memory.Limit); err != nil {
@@ -437,7 +464,7 @@ func (*memory) set(spec *specs.LinuxResources, path string) error {
 type cpu struct{}
 
 func (*cpu) set(spec *specs.LinuxResources, path string) error {
-	if spec.CPU == nil {
+	if spec == nil || spec.CPU == nil {
 		return nil
 	}
 	if err := setOptionalValueUint(path, "cpu.shares", spec.CPU.Shares); err != nil {
@@ -446,7 +473,13 @@ func (*cpu) set(spec *specs.LinuxResources, path string) error {
 	if err := setOptionalValueInt(path, "cpu.cfs_quota_us", spec.CPU.Quota); err != nil {
 		return err
 	}
-	return setOptionalValueUint(path, "cpu.cfs_period_us", spec.CPU.Period)
+	if err := setOptionalValueUint(path, "cpu.cfs_period_us", spec.CPU.Period); err != nil {
+		return err
+	}
+	if err := setOptionalValueUint(path, "cpu.rt_period_us", spec.CPU.RealtimePeriod); err != nil {
+		return err
+	}
+	return setOptionalValueInt(path, "cpu.rt_runtime_us", spec.CPU.RealtimeRuntime)
 }
 
 type cpuSet struct{}
@@ -454,7 +487,7 @@ type cpuSet struct{}
 func (*cpuSet) set(spec *specs.LinuxResources, path string) error {
 	// cpuset.cpus and mems are required fields, but are not set on a new cgroup.
 	// If not set in the spec, get it from one of the ancestors cgroup.
-	if spec.CPU == nil || spec.CPU.Cpus == "" {
+	if spec == nil || spec.CPU == nil || spec.CPU.Cpus == "" {
 		if _, err := fillFromAncestor(filepath.Join(path, "cpuset.cpus")); err != nil {
 			return err
 		}
@@ -464,18 +497,17 @@ func (*cpuSet) set(spec *specs.LinuxResources, path string) error {
 		}
 	}
 
-	if spec.CPU == nil || spec.CPU.Mems == "" {
+	if spec == nil || spec.CPU == nil || spec.CPU.Mems == "" {
 		_, err := fillFromAncestor(filepath.Join(path, "cpuset.mems"))
 		return err
 	}
-	mems := spec.CPU.Mems
-	return setValue(path, "cpuset.mems", mems)
+	return setValue(path, "cpuset.mems", spec.CPU.Mems)
 }
 
 type blockIO struct{}
 
 func (*blockIO) set(spec *specs.LinuxResources, path string) error {
-	if spec.BlockIO == nil {
+	if spec == nil || spec.BlockIO == nil {
 		return nil
 	}
 
@@ -487,13 +519,17 @@ func (*blockIO) set(spec *specs.LinuxResources, path string) error {
 	}
 
 	for _, dev := range spec.BlockIO.WeightDevice {
-		val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.Weight)
-		if err := setValue(path, "blkio.weight_device", val); err != nil {
-			return err
+		if dev.Weight != nil {
+			val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.Weight)
+			if err := setValue(path, "blkio.weight_device", val); err != nil {
+				return err
+			}
 		}
-		val = fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.LeafWeight)
-		if err := setValue(path, "blkio.leaf_weight_device", val); err != nil {
-			return err
+		if dev.LeafWeight != nil {
+			val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.LeafWeight)
+			if err := setValue(path, "blkio.leaf_weight_device", val); err != nil {
+				return err
+			}
 		}
 	}
 	if err := setThrottle(path, "blkio.throttle.read_bps_device", spec.BlockIO.ThrottleReadBpsDevice); err != nil {
@@ -521,7 +557,7 @@ func setThrottle(path, name string, devs []specs.LinuxThrottleDevice) error {
 type networkClass struct{}
 
 func (*networkClass) set(spec *specs.LinuxResources, path string) error {
-	if spec.Network == nil {
+	if spec == nil || spec.Network == nil {
 		return nil
 	}
 	return setOptionalValueUint32(path, "net_cls.classid", spec.Network.ClassID)
@@ -530,7 +566,7 @@ func (*networkClass) set(spec *specs.LinuxResources, path string) error {
 type networkPrio struct{}
 
 func (*networkPrio) set(spec *specs.LinuxResources, path string) error {
-	if spec.Network == nil {
+	if spec == nil || spec.Network == nil {
 		return nil
 	}
 	for _, prio := range spec.Network.Priorities {
@@ -545,9 +581,25 @@ func (*networkPrio) set(spec *specs.LinuxResources, path string) error {
 type pids struct{}
 
 func (*pids) set(spec *specs.LinuxResources, path string) error {
-	if spec.Pids == nil {
+	if spec == nil || spec.Pids == nil || spec.Pids.Limit <= 0 {
 		return nil
 	}
 	val := strconv.FormatInt(spec.Pids.Limit, 10)
 	return setValue(path, "pids.max", val)
 }
+
+type hugeTLB struct{}
+
+func (*hugeTLB) set(spec *specs.LinuxResources, path string) error {
+	if spec == nil {
+		return nil
+	}
+	for _, limit := range spec.HugepageLimits {
+		name := fmt.Sprintf("hugetlb.%s.limit_in_bytes", limit.Pagesize)
+		val := strconv.FormatUint(limit.Limit, 10)
+		if err := setValue(path, name, val); err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go
index 548c80e9a..9794517a7 100644
--- a/runsc/cgroup/cgroup_test.go
+++ b/runsc/cgroup/cgroup_test.go
@@ -15,7 +15,14 @@
 package cgroup
 
 import (
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strings"
 	"testing"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 )
 
 func TestUninstallEnoent(t *testing.T) {
@@ -65,3 +72,658 @@ func TestCountCpuset(t *testing.T) {
 		})
 	}
 }
+
+func uint16Ptr(v uint16) *uint16 {
+	return &v
+}
+
+func uint32Ptr(v uint32) *uint32 {
+	return &v
+}
+
+func int64Ptr(v int64) *int64 {
+	return &v
+}
+
+func uint64Ptr(v uint64) *uint64 {
+	return &v
+}
+
+func boolPtr(v bool) *bool {
+	return &v
+}
+
+func checkDir(t *testing.T, dir string, contents map[string]string) {
+	all, err := ioutil.ReadDir(dir)
+	if err != nil {
+		t.Fatalf("ReadDir(%q): %v", dir, err)
+	}
+	fileCount := 0
+	for _, file := range all {
+		if file.IsDir() {
+			// Only want to compare files.
+			continue
+		}
+		fileCount++
+
+		want, ok := contents[file.Name()]
+		if !ok {
+			t.Errorf("file not expected: %q", file.Name())
+			continue
+		}
+		gotBytes, err := ioutil.ReadFile(filepath.Join(dir, file.Name()))
+		if err != nil {
+			t.Fatal(err.Error())
+		}
+		got := strings.TrimSuffix(string(gotBytes), "\n")
+		if got != want {
+			t.Errorf("wrong file content, file: %q, want: %q, got: %q", file.Name(), want, got)
+		}
+	}
+	if fileCount != len(contents) {
+		t.Errorf("file is missing, want: %v, got: %v", contents, all)
+	}
+}
+
+func makeLinuxWeightDevice(major, minor int64, weight, leafWeight *uint16) specs.LinuxWeightDevice {
+	rv := specs.LinuxWeightDevice{
+		Weight:     weight,
+		LeafWeight: leafWeight,
+	}
+	rv.Major = major
+	rv.Minor = minor
+	return rv
+}
+
+func makeLinuxThrottleDevice(major, minor int64, rate uint64) specs.LinuxThrottleDevice {
+	rv := specs.LinuxThrottleDevice{
+		Rate: rate,
+	}
+	rv.Major = major
+	rv.Minor = minor
+	return rv
+}
+
+func TestBlockIO(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		spec  *specs.LinuxBlockIO
+		wants map[string]string
+	}{
+		{
+			name: "simple",
+			spec: &specs.LinuxBlockIO{
+				Weight:     uint16Ptr(1),
+				LeafWeight: uint16Ptr(2),
+			},
+			wants: map[string]string{
+				"blkio.weight":      "1",
+				"blkio.leaf_weight": "2",
+			},
+		},
+		{
+			name: "weight_device",
+			spec: &specs.LinuxBlockIO{
+				WeightDevice: []specs.LinuxWeightDevice{
+					makeLinuxWeightDevice(1, 2, uint16Ptr(3), uint16Ptr(4)),
+				},
+			},
+			wants: map[string]string{
+				"blkio.weight_device":      "1:2 3",
+				"blkio.leaf_weight_device": "1:2 4",
+			},
+		},
+		{
+			name: "weight_device_nil_values",
+			spec: &specs.LinuxBlockIO{
+				WeightDevice: []specs.LinuxWeightDevice{
+					makeLinuxWeightDevice(1, 2, nil, nil),
+				},
+			},
+		},
+		{
+			name: "throttle",
+			spec: &specs.LinuxBlockIO{
+				ThrottleReadBpsDevice: []specs.LinuxThrottleDevice{
+					makeLinuxThrottleDevice(1, 2, 3),
+				},
+				ThrottleReadIOPSDevice: []specs.LinuxThrottleDevice{
+					makeLinuxThrottleDevice(4, 5, 6),
+				},
+				ThrottleWriteBpsDevice: []specs.LinuxThrottleDevice{
+					makeLinuxThrottleDevice(7, 8, 9),
+				},
+				ThrottleWriteIOPSDevice: []specs.LinuxThrottleDevice{
+					makeLinuxThrottleDevice(10, 11, 12),
+				},
+			},
+			wants: map[string]string{
+				"blkio.throttle.read_bps_device":   "1:2 3",
+				"blkio.throttle.read_iops_device":  "4:5 6",
+				"blkio.throttle.write_bps_device":  "7:8 9",
+				"blkio.throttle.write_iops_device": "10:11 12",
+			},
+		},
+		{
+			name: "nil_values",
+			spec: &specs.LinuxBlockIO{},
+		},
+		{
+			name: "nil",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			defer os.RemoveAll(dir)
+
+			spec := &specs.LinuxResources{
+				BlockIO: tc.spec,
+			}
+			ctrlr := blockIO{}
+			if err := ctrlr.set(spec, dir); err != nil {
+				t.Fatalf("ctrlr.set(): %v", err)
+			}
+			checkDir(t, dir, tc.wants)
+		})
+	}
+}
+
+func TestCPU(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		spec  *specs.LinuxCPU
+		wants map[string]string
+	}{
+		{
+			name: "all",
+			spec: &specs.LinuxCPU{
+				Shares:          uint64Ptr(1),
+				Quota:           int64Ptr(2),
+				Period:          uint64Ptr(3),
+				RealtimeRuntime: int64Ptr(4),
+				RealtimePeriod:  uint64Ptr(5),
+			},
+			wants: map[string]string{
+				"cpu.shares":        "1",
+				"cpu.cfs_quota_us":  "2",
+				"cpu.cfs_period_us": "3",
+				"cpu.rt_runtime_us": "4",
+				"cpu.rt_period_us":  "5",
+			},
+		},
+		{
+			name: "nil_values",
+			spec: &specs.LinuxCPU{},
+		},
+		{
+			name: "nil",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			defer os.RemoveAll(dir)
+
+			spec := &specs.LinuxResources{
+				CPU: tc.spec,
+			}
+			ctrlr := cpu{}
+			if err := ctrlr.set(spec, dir); err != nil {
+				t.Fatalf("ctrlr.set(): %v", err)
+			}
+			checkDir(t, dir, tc.wants)
+		})
+	}
+}
+
+func TestCPUSet(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		spec  *specs.LinuxCPU
+		wants map[string]string
+	}{
+		{
+			name: "all",
+			spec: &specs.LinuxCPU{
+				Cpus: "foo",
+				Mems: "bar",
+			},
+			wants: map[string]string{
+				"cpuset.cpus": "foo",
+				"cpuset.mems": "bar",
+			},
+		},
+		// Don't test nil values because they are copied from the parent.
+		// See TestCPUSetAncestor().
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			defer os.RemoveAll(dir)
+
+			spec := &specs.LinuxResources{
+				CPU: tc.spec,
+			}
+			ctrlr := cpuSet{}
+			if err := ctrlr.set(spec, dir); err != nil {
+				t.Fatalf("ctrlr.set(): %v", err)
+			}
+			checkDir(t, dir, tc.wants)
+		})
+	}
+}
+
+// TestCPUSetAncestor checks that, when not available, value is read from
+// parent directory.
+func TestCPUSetAncestor(t *testing.T) {
+	// Prepare master directory with cgroup files that will be propagated to
+	// children.
+	grandpa, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+	if err != nil {
+		t.Fatalf("error creating temporary directory: %v", err)
+	}
+	defer os.RemoveAll(grandpa)
+
+	if err := ioutil.WriteFile(filepath.Join(grandpa, "cpuset.cpus"), []byte("parent-cpus"), 0666); err != nil {
+		t.Fatalf("ioutil.WriteFile(): %v", err)
+	}
+	if err := ioutil.WriteFile(filepath.Join(grandpa, "cpuset.mems"), []byte("parent-mems"), 0666); err != nil {
+		t.Fatalf("ioutil.WriteFile(): %v", err)
+	}
+
+	for _, tc := range []struct {
+		name string
+		spec *specs.LinuxCPU
+	}{
+		{
+			name: "nil_values",
+			spec: &specs.LinuxCPU{},
+		},
+		{
+			name: "nil",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			// Create empty files in intermediate directory. They should be ignored
+			// when reading, and then populated from parent.
+			parent, err := ioutil.TempDir(grandpa, "parent")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			defer os.RemoveAll(parent)
+			if err := ioutil.WriteFile(filepath.Join(parent, "cpuset.cpus"), nil, 0666); err != nil {
+				t.Fatalf("ioutil.WriteFile(): %v", err)
+			}
+			if err := ioutil.WriteFile(filepath.Join(parent, "cpuset.mems"), nil, 0666); err != nil {
+				t.Fatalf("ioutil.WriteFile(): %v", err)
+			}
+
+			// cgroup files must exist; WriteFile creates them empty and closes them.
+			dir, err := ioutil.TempDir(parent, "child")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			if err := ioutil.WriteFile(filepath.Join(dir, "cpuset.cpus"), nil, 0666); err != nil {
+				t.Fatalf("ioutil.WriteFile(): %v", err)
+			}
+			if err := ioutil.WriteFile(filepath.Join(dir, "cpuset.mems"), nil, 0666); err != nil {
+				t.Fatalf("ioutil.WriteFile(): %v", err)
+			}
+
+			spec := &specs.LinuxResources{
+				CPU: tc.spec,
+			}
+			ctrlr := cpuSet{}
+			if err := ctrlr.set(spec, dir); err != nil {
+				t.Fatalf("ctrlr.set(): %v", err)
+			}
+			want := map[string]string{
+				"cpuset.cpus": "parent-cpus",
+				"cpuset.mems": "parent-mems",
+			}
+			// Both parent and dir must have been populated from grandpa.
+			checkDir(t, parent, want)
+			checkDir(t, dir, want)
+		})
+	}
+}
+
+func TestHugeTlb(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		spec  []specs.LinuxHugepageLimit
+		wants map[string]string
+	}{
+		{
+			name: "single",
+			spec: []specs.LinuxHugepageLimit{
+				{
+					Pagesize: "1G",
+					Limit:    123,
+				},
+			},
+			wants: map[string]string{
+				"hugetlb.1G.limit_in_bytes": "123",
+			},
+		},
+		{
+			name: "multiple",
+			spec: []specs.LinuxHugepageLimit{
+				{
+					Pagesize: "1G",
+					Limit:    123,
+				},
+				{
+					Pagesize: "2G",
+					Limit:    456,
+				},
+				{
+					Pagesize: "1P",
+					Limit:    789,
+				},
+			},
+			wants: map[string]string{
+				"hugetlb.1G.limit_in_bytes": "123",
+				"hugetlb.2G.limit_in_bytes": "456",
+				"hugetlb.1P.limit_in_bytes": "789",
+			},
+		},
+		{
+			name: "nil",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			defer os.RemoveAll(dir)
+
+			spec := &specs.LinuxResources{
+				HugepageLimits: tc.spec,
+			}
+			ctrlr := hugeTLB{}
+			if err := ctrlr.set(spec, dir); err != nil {
+				t.Fatalf("ctrlr.set(): %v", err)
+			}
+			checkDir(t, dir, tc.wants)
+		})
+	}
+}
+
+func TestMemory(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		spec  *specs.LinuxMemory
+		wants map[string]string
+	}{
+		{
+			name: "all",
+			spec: &specs.LinuxMemory{
+				Limit:            int64Ptr(1),
+				Reservation:      int64Ptr(2),
+				Swap:             int64Ptr(3),
+				Kernel:           int64Ptr(4),
+				KernelTCP:        int64Ptr(5),
+				Swappiness:       uint64Ptr(6),
+				DisableOOMKiller: boolPtr(true),
+			},
+			wants: map[string]string{
+				"memory.limit_in_bytes":          "1",
+				"memory.soft_limit_in_bytes":     "2",
+				"memory.memsw.limit_in_bytes":    "3",
+				"memory.kmem.limit_in_bytes":     "4",
+				"memory.kmem.tcp.limit_in_bytes": "5",
+				"memory.swappiness":              "6",
+				"memory.oom_control":             "1",
+			},
+		},
+		{
+			// Disable OOM killer should only write when set to true.
+			name: "oomkiller",
+			spec: &specs.LinuxMemory{
+				DisableOOMKiller: boolPtr(false),
+			},
+		},
+		{
+			name: "nil_values",
+			spec: &specs.LinuxMemory{},
+		},
+		{
+			name: "nil",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			defer os.RemoveAll(dir)
+
+			spec := &specs.LinuxResources{
+				Memory: tc.spec,
+			}
+			ctrlr := memory{}
+			if err := ctrlr.set(spec, dir); err != nil {
+				t.Fatalf("ctrlr.set(): %v", err)
+			}
+			checkDir(t, dir, tc.wants)
+		})
+	}
+}
+
+func TestNetworkClass(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		spec  *specs.LinuxNetwork
+		wants map[string]string
+	}{
+		{
+			name: "all",
+			spec: &specs.LinuxNetwork{
+				ClassID: uint32Ptr(1),
+			},
+			wants: map[string]string{
+				"net_cls.classid": "1",
+			},
+		},
+		{
+			name: "nil_values",
+			spec: &specs.LinuxNetwork{},
+		},
+		{
+			name: "nil",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			defer os.RemoveAll(dir)
+
+			spec := &specs.LinuxResources{
+				Network: tc.spec,
+			}
+			ctrlr := networkClass{}
+			if err := ctrlr.set(spec, dir); err != nil {
+				t.Fatalf("ctrlr.set(): %v", err)
+			}
+			checkDir(t, dir, tc.wants)
+		})
+	}
+}
+
+func TestNetworkPriority(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		spec  *specs.LinuxNetwork
+		wants map[string]string
+	}{
+		{
+			name: "all",
+			spec: &specs.LinuxNetwork{
+				Priorities: []specs.LinuxInterfacePriority{
+					{
+						Name:     "foo",
+						Priority: 1,
+					},
+				},
+			},
+			wants: map[string]string{
+				"net_prio.ifpriomap": "foo 1",
+			},
+		},
+		{
+			name: "nil_values",
+			spec: &specs.LinuxNetwork{},
+		},
+		{
+			name: "nil",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			defer os.RemoveAll(dir)
+
+			spec := &specs.LinuxResources{
+				Network: tc.spec,
+			}
+			ctrlr := networkPrio{}
+			if err := ctrlr.set(spec, dir); err != nil {
+				t.Fatalf("ctrlr.set(): %v", err)
+			}
+			checkDir(t, dir, tc.wants)
+		})
+	}
+}
+
+func TestPids(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		spec  *specs.LinuxPids
+		wants map[string]string
+	}{
+		{
+			name: "all",
+			spec: &specs.LinuxPids{Limit: 1},
+			wants: map[string]string{
+				"pids.max": "1",
+			},
+		},
+		{
+			name: "nil_values",
+			spec: &specs.LinuxPids{},
+		},
+		{
+			name: "nil",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			defer os.RemoveAll(dir)
+
+			spec := &specs.LinuxResources{
+				Pids: tc.spec,
+			}
+			ctrlr := pids{}
+			if err := ctrlr.set(spec, dir); err != nil {
+				t.Fatalf("ctrlr.set(): %v", err)
+			}
+			checkDir(t, dir, tc.wants)
+		})
+	}
+}
+
+func TestLoadPaths(t *testing.T) {
+	for _, tc := range []struct {
+		name    string
+		cgroups string
+		want    map[string]string
+		err     string
+	}{
+		{
+			name:    "abs-path",
+			cgroups: "0:ctr:/path",
+			want:    map[string]string{"ctr": "/path"},
+		},
+		{
+			name:    "rel-path",
+			cgroups: "0:ctr:rel-path",
+			want:    map[string]string{"ctr": "rel-path"},
+		},
+		{
+			name:    "non-controller",
+			cgroups: "0:name=systemd:/path",
+			want:    map[string]string{"systemd": "/path"},
+		},
+		{
+			name: "empty",
+		},
+		{
+			name: "multiple",
+			cgroups: "0:ctr0:/path0\n" +
+				"1:ctr1:/path1\n" +
+				"2::/empty\n",
+			want: map[string]string{
+				"ctr0": "/path0",
+				"ctr1": "/path1",
+			},
+		},
+		{
+			name:    "missing-field",
+			cgroups: "0:nopath\n",
+			err:     "invalid cgroups file",
+		},
+		{
+			name:    "too-many-fields",
+			cgroups: "0:ctr:/path:extra\n",
+			err:     "invalid cgroups file",
+		},
+		{
+			name: "multiple-malformed",
+			cgroups: "0:ctr0:/path0\n" +
+				"1:ctr1:/path1\n" +
+				"2:\n",
+			err: "invalid cgroups file",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			r := strings.NewReader(tc.cgroups)
+			got, err := loadPathsHelper(r)
+			if len(tc.err) == 0 {
+				if err != nil {
+					t.Fatalf("Unexpected error: %v", err)
+				}
+			} else if err == nil || !strings.Contains(err.Error(), tc.err) { // nil guard: fail, don't panic.
+				t.Fatalf("Wrong error message, want: *%s*, got: %v", tc.err, err)
+			}
+			for key, vWant := range tc.want {
+				vGot, ok := got[key]
+				if !ok {
+					t.Errorf("Missing controller %q", key)
+				}
+				if vWant != vGot {
+					t.Errorf("Wrong controller %q value, want: %q, got: %q", key, vWant, vGot)
+				}
+				delete(got, key)
+			}
+			for k, v := range got {
+				t.Errorf("Unexpected controller %q: %q", k, v)
+			}
+		})
+	}
+}
diff --git a/runsc/cli/BUILD b/runsc/cli/BUILD
new file mode 100644
index 000000000..32cce2a18
--- /dev/null
+++ b/runsc/cli/BUILD
@@ -0,0 +1,22 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "cli",
+    srcs = ["main.go"],
+    visibility = [
+        "//:__pkg__",
+        "//runsc:__pkg__",
+    ],
+    deps = [
+        "//pkg/log",
+        "//pkg/refs",
+        "//pkg/sentry/platform",
+        "//runsc/cmd",
+        "//runsc/config",
+        "//runsc/flag",
+        "//runsc/specutils",
+        "@com_github_google_subcommands//:go_default_library",
+    ],
+)
diff --git a/runsc/cli/main.go b/runsc/cli/main.go
new file mode 100644
index 000000000..bca015db5
--- /dev/null
+++ b/runsc/cli/main.go
@@ -0,0 +1,256 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package cli is the main entrypoint for runsc.
+package cli
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"os"
+	"os/signal"
+	"syscall"
+	"time"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/runsc/cmd"
+	"gvisor.dev/gvisor/runsc/config"
+	"gvisor.dev/gvisor/runsc/flag"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+var (
+	// Although these flags are not part of the OCI spec, they are used by
+	// Docker, and thus should not be changed.
+	// TODO(gvisor.dev/issue/193): support systemd cgroups
+	systemdCgroup = flag.Bool("systemd-cgroup", false, "Use systemd for cgroups. NOT SUPPORTED.")
+	showVersion   = flag.Bool("version", false, "show version and exit.")
+
+	// These flags are unique to runsc, and are used to configure parts of the
+	// system that are not covered by the runtime spec.
+
+	// Debugging flags.
+	logFD      = flag.Int("log-fd", -1, "file descriptor to log to.  If set, the 'log' flag is ignored.")
+	debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to.  If set, the 'debug-log-dir' flag is ignored.")
+	panicLogFD = flag.Int("panic-log-fd", -1, "file descriptor to write Go's runtime messages.")
+)
+
+// Main registers all commands, parses flags, sets up logging, runs the subcommand and exits.
+func Main(version string) {
+	// Help and flags commands are generated automatically.
+	help := cmd.NewHelp(subcommands.DefaultCommander)
+	help.Register(new(cmd.Syscalls))
+	subcommands.Register(help, "")
+	subcommands.Register(subcommands.FlagsCommand(), "")
+
+	// Installation helpers.
+	const helperGroup = "helpers"
+	subcommands.Register(new(cmd.Install), helperGroup)
+	subcommands.Register(new(cmd.Uninstall), helperGroup)
+
+	// Register user-facing runsc commands.
+	subcommands.Register(new(cmd.Checkpoint), "")
+	subcommands.Register(new(cmd.Create), "")
+	subcommands.Register(new(cmd.Delete), "")
+	subcommands.Register(new(cmd.Do), "")
+	subcommands.Register(new(cmd.Events), "")
+	subcommands.Register(new(cmd.Exec), "")
+	subcommands.Register(new(cmd.Kill), "")
+	subcommands.Register(new(cmd.List), "")
+	subcommands.Register(new(cmd.Pause), "")
+	subcommands.Register(new(cmd.PS), "")
+	subcommands.Register(new(cmd.Restore), "")
+	subcommands.Register(new(cmd.Resume), "")
+	subcommands.Register(new(cmd.Run), "")
+	subcommands.Register(new(cmd.Spec), "")
+	subcommands.Register(new(cmd.State), "")
+	subcommands.Register(new(cmd.Start), "")
+	subcommands.Register(new(cmd.Wait), "")
+
+	// Register internal commands with the internal group name. This causes
+	// them to be sorted below the user-facing commands with empty group.
+	// The string below will be printed above the commands.
+	// Gofer is registered only in this group so that help lists it once.
+	const internalGroup = "internal use only"
+	subcommands.Register(new(cmd.Boot), internalGroup)
+	subcommands.Register(new(cmd.Debug), internalGroup)
+	subcommands.Register(new(cmd.Gofer), internalGroup)
+	subcommands.Register(new(cmd.Statefile), internalGroup)
+
+	config.RegisterFlags()
+
+	// All subcommands must be registered before flag parsing.
+	flag.Parse()
+
+	// Are we showing the version?
+	if *showVersion {
+		// The format here is the same as runc.
+		fmt.Fprintf(os.Stdout, "runsc version %s\n", version)
+		fmt.Fprintf(os.Stdout, "spec: %s\n", specutils.Version)
+		os.Exit(0)
+	}
+
+	// Create a new Config from the flags.
+	conf, err := config.NewFromFlags()
+	if err != nil {
+		cmd.Fatalf("%v", err)
+	}
+
+	// TODO(gvisor.dev/issue/193): support systemd cgroups
+	if *systemdCgroup {
+		fmt.Fprintln(os.Stderr, "systemd cgroup flag passed, but systemd cgroups not supported. See gvisor.dev/issue/193")
+		os.Exit(1)
+	}
+
+	var errorLogger io.Writer
+	if *logFD > -1 {
+		errorLogger = os.NewFile(uintptr(*logFD), "error log file")
+
+	} else if conf.LogFilename != "" {
+		// We must set O_APPEND and not O_TRUNC because Docker passes
+		// the same log file for all commands (and also parses these
+		// log files), so we can't destroy them on each command.
+		var err error
+		errorLogger, err = os.OpenFile(conf.LogFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
+		if err != nil {
+			cmd.Fatalf("error opening log file %q: %v", conf.LogFilename, err)
+		}
+	}
+	cmd.ErrorLogger = errorLogger
+
+	if _, err := platform.Lookup(conf.Platform); err != nil {
+		cmd.Fatalf("%v", err)
+	}
+
+	// Sets the reference leak check mode. Also set it in config below to
+	// propagate it to child processes.
+	refs.SetLeakMode(conf.ReferenceLeak)
+
+	// Set up logging.
+	if conf.Debug {
+		log.SetLevel(log.Debug)
+	}
+
+	// Logging will include the local date and time via the time package.
+	//
+	// On first use, time.Local initializes the local time zone, which
+	// involves opening tzdata files on the host. Since this requires
+	// opening host files, it must be done before syscall filter
+	// installation.
+	//
+	// Generally there will be a log message before filter installation
+	// that will force initialization, but force initialization here in
+	// case that does not occur.
+	_ = time.Local.String()
+
+	subcommand := flag.CommandLine.Arg(0)
+
+	var e log.Emitter
+	if *debugLogFD > -1 {
+		f := os.NewFile(uintptr(*debugLogFD), "debug log file")
+
+		e = newEmitter(conf.DebugLogFormat, f)
+
+	} else if conf.DebugLog != "" {
+		f, err := specutils.DebugLogFile(conf.DebugLog, subcommand, "" /* name */)
+		if err != nil {
+			cmd.Fatalf("error opening debug log file in %q: %v", conf.DebugLog, err)
+		}
+		e = newEmitter(conf.DebugLogFormat, f)
+
+	} else {
+		// Stderr is reserved for the application, just discard the logs if no debug
+		// log is specified.
+		e = newEmitter("text", ioutil.Discard)
+	}
+
+	if *panicLogFD > -1 || *debugLogFD > -1 {
+		fd := *panicLogFD
+		if fd < 0 {
+			fd = *debugLogFD
+		}
+		// Quick sanity check to make sure no other commands get passed
+		// a log fd (they should use log dir instead).
+		if subcommand != "boot" && subcommand != "gofer" {
+			cmd.Fatalf("flags --debug-log-fd and --panic-log-fd should only be passed to 'boot' and 'gofer' command, but was passed to %q", subcommand)
+		}
+
+		// If we are the boot process, then we own our stdio FDs and can do what we
+		// want with them. Since Docker and Containerd both eat boot's stderr, we
+		// dup our stderr to the provided log FD so that panics will appear in the
+		// logs, rather than just disappear.
+		if err := syscall.Dup3(fd, int(os.Stderr.Fd()), 0); err != nil {
+			cmd.Fatalf("error dup'ing fd %d to stderr: %v", fd, err)
+		}
+	} else if conf.AlsoLogToStderr {
+		e = &log.MultiEmitter{e, newEmitter(conf.DebugLogFormat, os.Stderr)}
+	}
+
+	log.SetTarget(e)
+
+	log.Infof("***************************")
+	log.Infof("Args: %s", os.Args)
+	log.Infof("Version %s", version)
+	log.Infof("PID: %d", os.Getpid())
+	log.Infof("UID: %d, GID: %d", os.Getuid(), os.Getgid())
+	log.Infof("Configuration:")
+	log.Infof("\t\tRootDir: %s", conf.RootDir)
+	log.Infof("\t\tPlatform: %v", conf.Platform)
+	log.Infof("\t\tFileAccess: %v, overlay: %t", conf.FileAccess, conf.Overlay)
+	log.Infof("\t\tNetwork: %v, logging: %t", conf.Network, conf.LogPackets)
+	log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls)
+	log.Infof("\t\tVFS2 enabled: %v", conf.VFS2)
+	log.Infof("***************************")
+
+	if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+		// SIGTERM is sent to all processes if a test exceeds its
+		// timeout and this case is handled by syscall_test_runner.
+		log.Warningf("Block the TERM signal. This is only safe in tests!")
+		signal.Ignore(syscall.SIGTERM)
+	}
+
+	// Call the subcommand and pass in the configuration.
+	var ws syscall.WaitStatus
+	subcmdCode := subcommands.Execute(context.Background(), conf, &ws)
+	if subcmdCode == subcommands.ExitSuccess {
+		log.Infof("Exiting with status: %v", ws)
+		if ws.Signaled() {
+			// No good way to return it, emulate what the shell does. Maybe raise
+			// signal to self?
+			os.Exit(128 + int(ws.Signal()))
+		}
+		os.Exit(ws.ExitStatus())
+	}
+	// Return an error that is unlikely to be used by the application.
+	log.Warningf("Failure to execute command, err: %v", subcmdCode)
+	os.Exit(128)
+}
+
+func newEmitter(format string, logFile io.Writer) log.Emitter {
+	switch format {
+	case "text":
+		return log.GoogleEmitter{&log.Writer{Next: logFile}}
+	case "json":
+		return log.JSONEmitter{&log.Writer{Next: logFile}}
+	case "json-k8s":
+		return log.K8sJSONEmitter{&log.Writer{Next: logFile}}
+	}
+	cmd.Fatalf("invalid log format %q, must be 'text', 'json', or 'json-k8s'", format)
+	panic("unreachable")
+}
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
index af3538ef0..2556f6d9e 100644
--- a/runsc/cmd/BUILD
+++ b/runsc/cmd/BUILD
@@ -45,12 +45,13 @@ go_library(
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/platform",
-        "//pkg/state",
+        "//pkg/state/pretty",
         "//pkg/state/statefile",
         "//pkg/sync",
         "//pkg/unet",
         "//pkg/urpc",
         "//runsc/boot",
+        "//runsc/config",
         "//runsc/console",
         "//runsc/container",
         "//runsc/flag",
@@ -58,7 +59,7 @@ go_library(
         "//runsc/fsgofer/filter",
         "//runsc/specutils",
         "@com_github_google_subcommands//:go_default_library",
-        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
         "@com_github_syndtr_gocapability//capability:go_default_library",
         "@org_golang_x_sys//unix:go_default_library",
     ],
@@ -84,12 +85,12 @@ go_test(
         "//pkg/sentry/kernel/auth",
         "//pkg/test/testutil",
         "//pkg/urpc",
-        "//runsc/boot",
+        "//runsc/config",
         "//runsc/container",
         "//runsc/specutils",
-        "@com_github_google_go-cmp//cmp:go_default_library",
-        "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
-        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@com_github_google_go_cmp//cmp:go_default_library",
+        "@com_github_google_go_cmp//cmp/cmpopts:go_default_library",
+        "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
         "@com_github_syndtr_gocapability//capability:go_default_library",
     ],
 )
diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
index 01204ab4d..2c92e3067 100644
--- a/runsc/cmd/boot.go
+++ b/runsc/cmd/boot.go
@@ -27,6 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
@@ -54,10 +55,6 @@ type Boot struct {
 	// provided in that order.
 	stdioFDs intFlags
 
-	// console is set to true if the sandbox should allow terminal ioctl(2)
-	// syscalls.
-	console bool
-
 	// applyCaps determines if capabilities defined in the spec should be applied
 	// to the process.
 	applyCaps bool
@@ -115,7 +112,6 @@ func (b *Boot) SetFlags(f *flag.FlagSet) {
 	f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file")
 	f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec")
 	f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order")
-	f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls")
 	f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process")
 	f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process")
 	f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace")
@@ -135,10 +131,10 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		return subcommands.ExitUsageError
 	}
 
-	// Ensure that if there is a panic, all goroutine stacks are printed.
-	debug.SetTraceback("system")
+	conf := args[0].(*config.Config)
 
-	conf := args[0].(*boot.Config)
+	// Set traceback level
+	debug.SetTraceback(conf.Traceback)
 
 	if b.attached {
 		// Ensure this process is killed after parent process terminates when
@@ -172,7 +168,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	// Get the spec from the specFD.
 	specFile := os.NewFile(uintptr(b.specFD), "spec file")
 	defer specFile.Close()
-	spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile)
+	spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile, conf)
 	if err != nil {
 		Fatalf("reading spec: %v", err)
 	}
@@ -229,7 +225,6 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		Device:       os.NewFile(uintptr(b.deviceFD), "platform device"),
 		GoferFDs:     b.ioFDs.GetArray(),
 		StdioFDs:     b.stdioFDs.GetArray(),
-		Console:      b.console,
 		NumCPU:       b.cpuNum,
 		TotalMem:     b.totalMem,
 		UserLogFD:    b.userLogFD,
diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go
index a84067112..e13a94486 100644
--- a/runsc/cmd/capability_test.go
+++ b/runsc/cmd/capability_test.go
@@ -24,7 +24,7 @@ import (
 	"github.com/syndtr/gocapability/capability"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/test/testutil"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
@@ -88,7 +88,7 @@ func TestCapabilities(t *testing.T) {
 	conf := testutil.TestConfig(t)
 
 	// Use --network=host to make sandbox use spec's capabilities.
-	conf.Network = boot.NetworkHost
+	conf.Network = config.NetworkHost
 
 	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go
index 8a29e521e..8fe0c427a 100644
--- a/runsc/cmd/checkpoint.go
+++ b/runsc/cmd/checkpoint.go
@@ -22,7 +22,7 @@ import (
 
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
@@ -72,7 +72,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	waitStatus := args[1].(*syscall.WaitStatus)
 
 	cont, err := container.Load(conf.RootDir, id)
@@ -118,7 +118,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa
 		Fatalf("setting bundleDir")
 	}
 
-	spec, err := specutils.ReadSpec(bundleDir)
+	spec, err := specutils.ReadSpec(bundleDir, conf)
 	if err != nil {
 		Fatalf("reading spec: %v", err)
 	}
diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go
index 910e97577..e76f7ba1d 100644
--- a/runsc/cmd/create.go
+++ b/runsc/cmd/create.go
@@ -18,7 +18,7 @@ import (
 	"context"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
@@ -81,7 +81,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	if conf.Rootless {
 		return Errorf("Rootless mode not supported with %q", c.Name())
@@ -91,7 +91,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
 	if bundleDir == "" {
 		bundleDir = getwdOrDie()
 	}
-	spec, err := specutils.ReadSpec(bundleDir)
+	spec, err := specutils.ReadSpec(bundleDir, conf)
 	if err != nil {
 		return Errorf("reading spec: %v", err)
 	}
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index b5de2588b..132198222 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -25,27 +25,26 @@ import (
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/control"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
 
 // Debug implements subcommands.Command for the "debug" command.
 type Debug struct {
-	pid              int
-	stacks           bool
-	signal           int
-	profileHeap      string
-	profileCPU       string
-	profileGoroutine string
-	profileBlock     string
-	profileMutex     string
-	trace            string
-	strace           string
-	logLevel         string
-	logPackets       string
-	duration         time.Duration
-	ps               bool
+	pid          int
+	stacks       bool
+	signal       int
+	profileHeap  string
+	profileCPU   string
+	profileBlock string
+	profileMutex string
+	trace        string
+	strace       string
+	logLevel     string
+	logPackets   string
+	duration     time.Duration
+	ps           bool
 }
 
 // Name implements subcommands.Command.
@@ -69,7 +68,6 @@ func (d *Debug) SetFlags(f *flag.FlagSet) {
 	f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log")
 	f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.")
 	f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.")
-	f.StringVar(&d.profileGoroutine, "profile-goroutine", "", "writes goroutine profile to the given file.")
 	f.StringVar(&d.profileBlock, "profile-block", "", "writes block profile to the given file.")
 	f.StringVar(&d.profileMutex, "profile-mutex", "", "writes mutex profile to the given file.")
 	f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles")
@@ -84,7 +82,7 @@ func (d *Debug) SetFlags(f *flag.FlagSet) {
 // Execute implements subcommands.Command.Execute.
 func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
 	var c *container.Container
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	if d.pid == 0 {
 		// No pid, container ID must have been provided.
@@ -153,18 +151,6 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		}
 		log.Infof("Heap profile written to %q", d.profileHeap)
 	}
-	if d.profileGoroutine != "" {
-		f, err := os.Create(d.profileGoroutine)
-		if err != nil {
-			return Errorf(err.Error())
-		}
-		defer f.Close()
-
-		if err := c.Sandbox.GoroutineProfile(f); err != nil {
-			return Errorf(err.Error())
-		}
-		log.Infof("Goroutine profile written to %q", d.profileGoroutine)
-	}
 	if d.profileBlock != "" {
 		f, err := os.Create(d.profileBlock)
 		if err != nil {
diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go
index 0e4863f50..4e49deff8 100644
--- a/runsc/cmd/delete.go
+++ b/runsc/cmd/delete.go
@@ -21,7 +21,7 @@ import (
 
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -59,14 +59,14 @@ func (d *Delete) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
 		return subcommands.ExitUsageError
 	}
 
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	if err := d.execute(f.Args(), conf); err != nil {
 		Fatalf("%v", err)
 	}
 	return subcommands.ExitSuccess
 }
 
-func (d *Delete) execute(ids []string, conf *boot.Config) error {
+func (d *Delete) execute(ids []string, conf *config.Config) error {
 	for _, id := range ids {
 		c, err := container.Load(conf.RootDir, id)
 		if err != nil {
diff --git a/runsc/cmd/delete_test.go b/runsc/cmd/delete_test.go
index cb59516a3..e2d994a05 100644
--- a/runsc/cmd/delete_test.go
+++ b/runsc/cmd/delete_test.go
@@ -18,7 +18,7 @@ import (
 	"io/ioutil"
 	"testing"
 
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 func TestNotFound(t *testing.T) {
@@ -27,7 +27,7 @@ func TestNotFound(t *testing.T) {
 	if err != nil {
 		t.Fatalf("error creating dir: %v", err)
 	}
-	conf := &boot.Config{RootDir: dir}
+	conf := &config.Config{RootDir: dir}
 
 	d := Delete{}
 	if err := d.execute(ids, conf); err == nil {
diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go
index 7d1310c96..640de4c47 100644
--- a/runsc/cmd/do.go
+++ b/runsc/cmd/do.go
@@ -17,6 +17,7 @@ package cmd
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"io/ioutil"
 	"math/rand"
@@ -30,12 +31,14 @@ import (
 	"github.com/google/subcommands"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
+var errNoDefaultInterface = errors.New("no default interface found")
+
 // Do implements subcommands.Command for the "do" command. It sets up a simple
 // sandbox and executes the command inside it. See Usage() for more details.
 type Do struct {
@@ -82,7 +85,7 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
 		return subcommands.ExitUsageError
 	}
 
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	waitStatus := args[1].(*syscall.WaitStatus)
 
 	if conf.Rootless {
@@ -125,27 +128,29 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
 	specutils.LogSpec(spec)
 
 	cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000))
-	if conf.Network == boot.NetworkNone {
-		netns := specs.LinuxNamespace{
-			Type: specs.NetworkNamespace,
-		}
-		if spec.Linux != nil {
-			panic("spec.Linux is not nil")
-		}
-		spec.Linux = &specs.Linux{Namespaces: []specs.LinuxNamespace{netns}}
+	if conf.Network == config.NetworkNone {
+		addNamespace(spec, specs.LinuxNamespace{Type: specs.NetworkNamespace})
 
 	} else if conf.Rootless {
-		if conf.Network == boot.NetworkSandbox {
-			c.notifyUser("*** Warning: using host network due to --rootless ***")
-			conf.Network = boot.NetworkHost
+		if conf.Network == config.NetworkSandbox {
+			c.notifyUser("*** Warning: sandbox network isn't supported with --rootless, switching to host ***")
+			conf.Network = config.NetworkHost
 		}
 
 	} else {
-		clean, err := c.setupNet(cid, spec)
-		if err != nil {
+		switch clean, err := c.setupNet(cid, spec); err {
+		case errNoDefaultInterface:
+			log.Warningf("Network interface not found, using internal network")
+			addNamespace(spec, specs.LinuxNamespace{Type: specs.NetworkNamespace})
+			conf.Network = config.NetworkHost
+
+		case nil:
+			// Setup successful.
+			defer clean()
+
+		default:
 			return Errorf("Error setting up network: %v", err)
 		}
-		defer clean()
 	}
 
 	out, err := json.Marshal(spec)
@@ -199,6 +204,13 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
 	return subcommands.ExitSuccess
 }
 
+func addNamespace(spec *specs.Spec, ns specs.LinuxNamespace) {
+	if spec.Linux == nil {
+		spec.Linux = &specs.Linux{}
+	}
+	spec.Linux.Namespaces = append(spec.Linux.Namespaces, ns)
+}
+
 func (c *Do) notifyUser(format string, v ...interface{}) {
 	if !c.quiet {
 		fmt.Printf(format+"\n", v...)
@@ -219,10 +231,14 @@ func resolvePath(path string) (string, error) {
 	return path, nil
 }
 
+// setupNet sets up the sandbox network, including the creation of a network
+// namespace and iptables rules to redirect the traffic. Returns a cleanup
+// function to tear down the network. Returns errNoDefaultInterface when there
+// is no network interface available to set up the network.
 func (c *Do) setupNet(cid string, spec *specs.Spec) (func(), error) {
 	dev, err := defaultDevice()
 	if err != nil {
-		return nil, err
+		return nil, errNoDefaultInterface
 	}
 	peerIP, err := calculatePeerIP(c.ip)
 	if err != nil {
@@ -279,14 +295,11 @@ func (c *Do) setupNet(cid string, spec *specs.Spec) (func(), error) {
 		return nil, err
 	}
 
-	if spec.Linux == nil {
-		spec.Linux = &specs.Linux{}
-	}
 	netns := specs.LinuxNamespace{
 		Type: specs.NetworkNamespace,
 		Path: filepath.Join("/var/run/netns", cid),
 	}
-	spec.Linux.Namespaces = append(spec.Linux.Namespaces, netns)
+	addNamespace(spec, netns)
 
 	return func() { c.cleanupNet(cid, dev, resolvPath, hostnamePath, hostsPath) }, nil
 }
diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go
index 51f6a98ed..25fe2cf1c 100644
--- a/runsc/cmd/events.go
+++ b/runsc/cmd/events.go
@@ -22,7 +22,7 @@ import (
 
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -72,7 +72,7 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	c, err := container.Load(conf.RootDir, id)
 	if err != nil {
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
index d9a94903e..775ed4b43 100644
--- a/runsc/cmd/exec.go
+++ b/runsc/cmd/exec.go
@@ -33,7 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/urpc"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/console"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
@@ -105,7 +105,7 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) {
 // Execute implements subcommands.Command.Execute. It starts a process in an
 // already created container.
 func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	e, id, err := ex.parseArgs(f, conf.EnableRaw)
 	if err != nil {
 		Fatalf("parsing process spec: %v", err)
@@ -220,7 +220,7 @@ func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.Exi
 	cmd.Stderr = os.Stderr
 
 	// If the console control socket file is provided, then create a new
-	// pty master/slave pair and set the TTY on the sandbox process.
+	// pty master/replica pair and set the TTY on the sandbox process.
 	if ex.consoleSocket != "" {
 		// Create a new TTY pair and send the master on the provided socket.
 		tty, err := console.NewWithSocket(ex.consoleSocket)
@@ -229,7 +229,7 @@ func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.Exi
 		}
 		defer tty.Close()
 
-		// Set stdio to the new TTY slave.
+		// Set stdio to the new TTY replica.
 		cmd.Stdin = tty
 		cmd.Stdout = tty
 		cmd.Stderr = tty
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 10448a759..371fcc0ae 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -30,7 +30,7 @@ import (
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/fsgofer"
 	"gvisor.dev/gvisor/runsc/fsgofer/filter"
@@ -62,9 +62,8 @@ type Gofer struct {
 	applyCaps bool
 	setUpRoot bool
 
-	panicOnWrite bool
-	specFD       int
-	mountsFD     int
+	specFD   int
+	mountsFD int
 }
 
 // Name implements subcommands.Command.
@@ -87,7 +86,6 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) {
 	f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory")
 	f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec")
 	f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do")
-	f.BoolVar(&g.panicOnWrite, "panic-on-write", false, "if true, panics on attempts to write to RO mounts. RW mounts are unnaffected")
 	f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process")
 	f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec")
 	f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).")
@@ -100,15 +98,15 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		return subcommands.ExitUsageError
 	}
 
+	conf := args[0].(*config.Config)
+
 	specFile := os.NewFile(uintptr(g.specFD), "spec file")
 	defer specFile.Close()
-	spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile)
+	spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile, conf)
 	if err != nil {
 		Fatalf("reading spec: %v", err)
 	}
 
-	conf := args[0].(*boot.Config)
-
 	if g.setUpRoot {
 		if err := setupRootFS(spec, conf); err != nil {
 			Fatalf("Error setting up root FS: %v", err)
@@ -168,8 +166,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	// Start with root mount, then add any other additional mount as needed.
 	ats := make([]p9.Attacher, 0, len(spec.Mounts)+1)
 	ap, err := fsgofer.NewAttachPoint("/", fsgofer.Config{
-		ROMount:      spec.Root.Readonly || conf.Overlay,
-		PanicOnWrite: g.panicOnWrite,
+		ROMount: spec.Root.Readonly || conf.Overlay,
 	})
 	if err != nil {
 		Fatalf("creating attach point: %v", err)
@@ -181,9 +178,8 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	for _, m := range spec.Mounts {
 		if specutils.Is9PMount(m) {
 			cfg := fsgofer.Config{
-				ROMount:      isReadonlyMount(m.Options) || conf.Overlay,
-				PanicOnWrite: g.panicOnWrite,
-				HostUDS:      conf.FSGoferHostUDS,
+				ROMount: isReadonlyMount(m.Options) || conf.Overlay,
+				HostUDS: conf.FSGoferHostUDS,
 			}
 			ap, err := fsgofer.NewAttachPoint(m.Destination, cfg)
 			if err != nil {
@@ -263,7 +259,7 @@ func isReadonlyMount(opts []string) bool {
 	return false
 }
 
-func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
+func setupRootFS(spec *specs.Spec, conf *config.Config) error {
 	// Convert all shared mounts into slaves to be sure that nothing will be
 	// propagated outside of our namespace.
 	if err := syscall.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil {
@@ -306,7 +302,7 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
 	}
 
 	// Replace the current spec, with the clean spec with symlinks resolved.
-	if err := setupMounts(spec.Mounts, root); err != nil {
+	if err := setupMounts(conf, spec.Mounts, root); err != nil {
 		Fatalf("error setting up FS: %v", err)
 	}
 
@@ -316,13 +312,14 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
 		if err != nil {
 			return fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err)
 		}
+		log.Infof("Create working directory %q if needed", spec.Process.Cwd)
 		if err := os.MkdirAll(dst, 0755); err != nil {
 			return fmt.Errorf("creating working directory %q: %v", spec.Process.Cwd, err)
 		}
 	}
 
 	// Check if root needs to be remounted as readonly.
-	if spec.Root.Readonly {
+	if spec.Root.Readonly || conf.Overlay {
 		// If root is a mount point but not read-only, we can change mount options
 		// to make it read-only for extra safety.
 		log.Infof("Remounting root as readonly: %q", root)
@@ -346,7 +343,7 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
 // setupMounts binds mount all mounts specified in the spec in their correct
 // location inside root. It will resolve relative paths and symlinks. It also
 // creates directories as needed.
-func setupMounts(mounts []specs.Mount, root string) error {
+func setupMounts(conf *config.Config, mounts []specs.Mount, root string) error {
 	for _, m := range mounts {
 		if m.Type != "bind" || !specutils.IsSupportedDevMount(m) {
 			continue
@@ -358,6 +355,11 @@ func setupMounts(mounts []specs.Mount, root string) error {
 		}
 
 		flags := specutils.OptionsToFlags(m.Options) | syscall.MS_BIND
+		if conf.Overlay {
+			// Force mount read-only if writes are not going to be sent to it.
+			flags |= syscall.MS_RDONLY
+		}
+
 		log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags)
 		if err := specutils.Mount(m.Source, dst, m.Type, flags); err != nil {
 			return fmt.Errorf("mounting %v: %v", m, err)
@@ -380,7 +382,7 @@ func setupMounts(mounts []specs.Mount, root string) error {
 // Otherwise, it may follow symlinks to locations that would be overwritten
 // with another mount point and return the wrong location. In short, make sure
 // setupMounts() has been called before.
-func resolveMounts(conf *boot.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
+func resolveMounts(conf *config.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
 	cleanMounts := make([]specs.Mount, 0, len(mounts))
 	for _, m := range mounts {
 		if m.Type != "bind" || !specutils.IsSupportedDevMount(m) {
@@ -462,7 +464,7 @@ func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, erro
 }
 
 // adjustMountOptions adds 'overlayfs_stale_read' if mounting over overlayfs.
-func adjustMountOptions(conf *boot.Config, path string, opts []string) ([]string, error) {
+func adjustMountOptions(conf *config.Config, path string, opts []string) ([]string, error) {
 	rv := make([]string, len(opts))
 	copy(rv, opts)
 
diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go
index 8282ea0e0..04eee99b2 100644
--- a/runsc/cmd/kill.go
+++ b/runsc/cmd/kill.go
@@ -23,7 +23,7 @@ import (
 
 	"github.com/google/subcommands"
 	"golang.org/x/sys/unix"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -63,7 +63,7 @@ func (k *Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	if k.pid != 0 && k.all {
 		Fatalf("it is invalid to specify both --all and --pid")
diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go
index d8d906fe3..f92d6fef9 100644
--- a/runsc/cmd/list.go
+++ b/runsc/cmd/list.go
@@ -24,7 +24,7 @@ import (
 
 	"github.com/google/subcommands"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -63,7 +63,7 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		return subcommands.ExitUsageError
 	}
 
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	ids, err := container.List(conf.RootDir)
 	if err != nil {
 		Fatalf("%v", err)
diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go
index 6f95a9837..0eb1402ed 100644
--- a/runsc/cmd/pause.go
+++ b/runsc/cmd/pause.go
@@ -18,7 +18,7 @@ import (
 	"context"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -53,7 +53,7 @@ func (*Pause) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	cont, err := container.Load(conf.RootDir, id)
 	if err != nil {
diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go
index 7fb8041af..bc58c928f 100644
--- a/runsc/cmd/ps.go
+++ b/runsc/cmd/ps.go
@@ -20,7 +20,7 @@ import (
 
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/sentry/control"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -58,7 +58,7 @@ func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{})
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	c, err := container.Load(conf.RootDir, id)
 	if err != nil {
diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go
index 72584b326..096ec814c 100644
--- a/runsc/cmd/restore.go
+++ b/runsc/cmd/restore.go
@@ -20,7 +20,7 @@ import (
 	"syscall"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
@@ -77,7 +77,7 @@ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	waitStatus := args[1].(*syscall.WaitStatus)
 
 	if conf.Rootless {
@@ -88,7 +88,7 @@ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{
 	if bundleDir == "" {
 		bundleDir = getwdOrDie()
 	}
-	spec, err := specutils.ReadSpec(bundleDir)
+	spec, err := specutils.ReadSpec(bundleDir, conf)
 	if err != nil {
 		return Errorf("reading spec: %v", err)
 	}
diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go
index 61a55a554..f24823f99 100644
--- a/runsc/cmd/resume.go
+++ b/runsc/cmd/resume.go
@@ -18,7 +18,7 @@ import (
 	"context"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -54,7 +54,7 @@ func (r *Resume) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	cont, err := container.Load(conf.RootDir, id)
 	if err != nil {
diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go
index cf41581ad..c48cbe4cd 100644
--- a/runsc/cmd/run.go
+++ b/runsc/cmd/run.go
@@ -19,7 +19,7 @@ import (
 	"syscall"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 	"gvisor.dev/gvisor/runsc/specutils"
@@ -64,7 +64,7 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 	waitStatus := args[1].(*syscall.WaitStatus)
 
 	if conf.Rootless {
@@ -75,7 +75,7 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	if bundleDir == "" {
 		bundleDir = getwdOrDie()
 	}
-	spec, err := specutils.ReadSpec(bundleDir)
+	spec, err := specutils.ReadSpec(bundleDir, conf)
 	if err != nil {
 		return Errorf("reading spec: %v", err)
 	}
diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go
index a2b0a4b14..55194e641 100644
--- a/runsc/cmd/spec.go
+++ b/runsc/cmd/spec.go
@@ -16,124 +16,122 @@ package cmd
 
 import (
 	"context"
-	"fmt"
-	"io/ioutil"
+	"encoding/json"
+	"io"
 	"os"
 	"path/filepath"
 
 	"github.com/google/subcommands"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/runsc/flag"
 )
 
-func genSpec(cwd string) []byte {
-	var template = fmt.Sprintf(`{
-	"ociVersion": "1.0.0",
-	"process": {
-		"terminal": true,
-		"user": {
-			"uid": 0,
-			"gid": 0
-		},
-		"args": [
-			"sh"
-		],
-		"env": [
-			"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
-			"TERM=xterm"
-		],
-		"cwd": "%s",
-		"capabilities": {
-			"bounding": [
-				"CAP_AUDIT_WRITE",
-				"CAP_KILL",
-				"CAP_NET_BIND_SERVICE"
-			],
-			"effective": [
-				"CAP_AUDIT_WRITE",
-				"CAP_KILL",
-				"CAP_NET_BIND_SERVICE"
-			],
-			"inheritable": [
-				"CAP_AUDIT_WRITE",
-				"CAP_KILL",
-				"CAP_NET_BIND_SERVICE"
-			],
-			"permitted": [
-				"CAP_AUDIT_WRITE",
-				"CAP_KILL",
-				"CAP_NET_BIND_SERVICE"
-			],
-			"ambient": [
-				"CAP_AUDIT_WRITE",
-				"CAP_KILL",
-				"CAP_NET_BIND_SERVICE"
-			]
-		},
-		"rlimits": [
-			{
-				"type": "RLIMIT_NOFILE",
-				"hard": 1024,
-				"soft": 1024
-			}
-		]
-	},
-	"root": {
-		"path": "rootfs",
-		"readonly": true
-	},
-	"hostname": "runsc",
-	"mounts": [
-		{
-			"destination": "/proc",
-			"type": "proc",
-			"source": "proc"
+func writeSpec(w io.Writer, cwd string, netns string, args []string) error {
+	spec := &specs.Spec{
+		Version: "1.0.0",
+		Process: &specs.Process{
+			Terminal: true,
+			User: specs.User{
+				UID: 0,
+				GID: 0,
+			},
+			Args: args,
+			Env: []string{
+				"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+				"TERM=xterm",
+			},
+			Cwd: cwd,
+			Capabilities: &specs.LinuxCapabilities{
+				Bounding: []string{
+					"CAP_AUDIT_WRITE",
+					"CAP_KILL",
+					"CAP_NET_BIND_SERVICE",
+				},
+				Effective: []string{
+					"CAP_AUDIT_WRITE",
+					"CAP_KILL",
+					"CAP_NET_BIND_SERVICE",
+				},
+				Inheritable: []string{
+					"CAP_AUDIT_WRITE",
+					"CAP_KILL",
+					"CAP_NET_BIND_SERVICE",
+				},
+				Permitted: []string{
+					"CAP_AUDIT_WRITE",
+					"CAP_KILL",
+					"CAP_NET_BIND_SERVICE",
+				},
+				// TODO(gvisor.dev/issue/3166): support ambient capabilities
+			},
+			Rlimits: []specs.POSIXRlimit{
+				{
+					Type: "RLIMIT_NOFILE",
+					Hard: 1024,
+					Soft: 1024,
+				},
+			},
 		},
-		{
-			"destination": "/dev",
-			"type": "tmpfs",
-			"source": "tmpfs",
-			"options": []
+		Root: &specs.Root{
+			Path:     "rootfs",
+			Readonly: true,
 		},
-		{
-			"destination": "/sys",
-			"type": "sysfs",
-			"source": "sysfs",
-			"options": [
-				"nosuid",
-				"noexec",
-				"nodev",
-				"ro"
-			]
-		}
-	],
-	"linux": {
-		"namespaces": [
+		Hostname: "runsc",
+		Mounts: []specs.Mount{
 			{
-				"type": "pid"
+				Destination: "/proc",
+				Type:        "proc",
+				Source:      "proc",
 			},
 			{
-				"type": "network"
+				Destination: "/dev",
+				Type:        "tmpfs",
+				Source:      "tmpfs",
 			},
 			{
-				"type": "ipc"
+				Destination: "/sys",
+				Type:        "sysfs",
+				Source:      "sysfs",
+				Options: []string{
+					"nosuid",
+					"noexec",
+					"nodev",
+					"ro",
+				},
 			},
-			{
-				"type": "uts"
+		},
+		Linux: &specs.Linux{
+			Namespaces: []specs.LinuxNamespace{
+				{
+					Type: "pid",
+				},
+				{
+					Type: "network",
+					Path: netns,
+				},
+				{
+					Type: "ipc",
+				},
+				{
+					Type: "uts",
+				},
+				{
+					Type: "mount",
+				},
 			},
-			{
-				"type": "mount"
-			}
-		]
+		},
 	}
-}`, cwd)
 
-	return []byte(template)
+	e := json.NewEncoder(w)
+	e.SetIndent("", "    ")
+	return e.Encode(spec)
 }
 
 // Spec implements subcommands.Command for the "spec" command.
 type Spec struct {
 	bundle string
 	cwd    string
+	netns  string
 }
 
 // Name implements subcommands.Command.Name.
@@ -148,21 +146,26 @@ func (*Spec) Synopsis() string {
 
 // Usage implements subcommands.Command.Usage.
 func (*Spec) Usage() string {
-	return `spec [options] - create a new OCI bundle specification file.
+	return `spec [options] [-- args...] - create a new OCI bundle specification file.
+
+The spec command creates a new specification file (config.json) for a new OCI
+bundle.
 
-The spec command creates a new specification file (config.json) for a new OCI bundle.
+The specification file is a starter file that runs the command specified by
+'args' in the container. If 'args' is not specified the default is to run the
+'sh' program.
 
-The specification file is a starter file that runs the "sh" command in the container. You
-should edit the file to suit your needs. You can find out more about the format of the
-specification file by visiting the OCI runtime spec repository:
+While a number of flags are provided to change values in the specification, you
+can examine the file and edit it to suit your needs after this command runs.
+You can find out more about the format of the specification file by visiting
+the OCI runtime spec repository:
 https://github.com/opencontainers/runtime-spec/
 
 EXAMPLE:
     $ mkdir -p bundle/rootfs
     $ cd bundle
-    $ runsc spec
+    $ runsc spec -- /hello
     $ docker export $(docker create hello-world) | tar -xf - -C rootfs
-    $ sed -i 's;"sh";"/hello";' config.json
     $ sudo runsc run hello
 
 `
@@ -173,18 +176,29 @@ func (s *Spec) SetFlags(f *flag.FlagSet) {
 	f.StringVar(&s.bundle, "bundle", ".", "path to the root of the OCI bundle")
 	f.StringVar(&s.cwd, "cwd", "/", "working directory that will be set for the executable, "+
 		"this value MUST be an absolute path")
+	f.StringVar(&s.netns, "netns", "", "network namespace path")
 }
 
 // Execute implements subcommands.Command.Execute.
 func (s *Spec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	// Grab the arguments.
+	containerArgs := f.Args()
+	if len(containerArgs) == 0 {
+		containerArgs = []string{"sh"}
+	}
+
 	confPath := filepath.Join(s.bundle, "config.json")
 	if _, err := os.Stat(confPath); !os.IsNotExist(err) {
 		Fatalf("file %q already exists", confPath)
 	}
 
-	var spec = genSpec(s.cwd)
+	configFile, err := os.OpenFile(confPath, os.O_WRONLY|os.O_CREATE, 0664)
+	if err != nil {
+		Fatalf("opening file %q: %v", confPath, err)
+	}
 
-	if err := ioutil.WriteFile(confPath, spec, 0664); err != nil {
+	err = writeSpec(configFile, s.cwd, s.netns, containerArgs)
+	if err != nil {
 		Fatalf("writing to %q: %v", confPath, err)
 	}
 
diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go
index 0205fd9f7..139edbd49 100644
--- a/runsc/cmd/start.go
+++ b/runsc/cmd/start.go
@@ -18,9 +18,10 @@ import (
 	"context"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
+	"gvisor.dev/gvisor/runsc/specutils"
 )
 
 // Start implements subcommands.Command for the "start" command.
@@ -52,12 +53,18 @@ func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	c, err := container.Load(conf.RootDir, id)
 	if err != nil {
 		Fatalf("loading container: %v", err)
 	}
+	// Read the spec again here to ensure flag annotations from the spec are
+	// applied to "conf".
+	if _, err := specutils.ReadSpec(c.BundleDir, conf); err != nil {
+		Fatalf("reading spec: %v", err)
+	}
+
 	if err := c.Start(conf); err != nil {
 		Fatalf("starting container: %v", err)
 	}
diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go
index cf2413deb..2bd2ab9f8 100644
--- a/runsc/cmd/state.go
+++ b/runsc/cmd/state.go
@@ -21,7 +21,7 @@ import (
 
 	"github.com/google/subcommands"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -55,7 +55,7 @@ func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	c, err := container.Load(conf.RootDir, id)
 	if err != nil {
diff --git a/runsc/cmd/statefile.go b/runsc/cmd/statefile.go
index e6f1907da..daed9e728 100644
--- a/runsc/cmd/statefile.go
+++ b/runsc/cmd/statefile.go
@@ -20,7 +20,7 @@ import (
 	"os"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/pkg/state"
+	"gvisor.dev/gvisor/pkg/state/pretty"
 	"gvisor.dev/gvisor/pkg/state/statefile"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -105,8 +105,14 @@ func (s *Statefile) Execute(_ context.Context, f *flag.FlagSet, args ...interfac
 		if err != nil {
 			Fatalf("error parsing statefile: %v", err)
 		}
-		if err := state.PrettyPrint(output, rc, s.html); err != nil {
-			Fatalf("error printing state: %v", err)
+		if s.html {
+			if err := pretty.PrintHTML(output, rc); err != nil {
+				Fatalf("error printing state: %v", err)
+			}
+		} else {
+			if err := pretty.PrintText(output, rc); err != nil {
+				Fatalf("error printing state: %v", err)
+			}
 		}
 		return subcommands.ExitSuccess
 	}
diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go
index 29c0a15f0..28d0642ed 100644
--- a/runsc/cmd/wait.go
+++ b/runsc/cmd/wait.go
@@ -21,7 +21,7 @@ import (
 	"syscall"
 
 	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
 )
@@ -70,7 +70,7 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	}
 
 	id := f.Arg(0)
-	conf := args[0].(*boot.Config)
+	conf := args[0].(*config.Config)
 
 	c, err := container.Load(conf.RootDir, id)
 	if err != nil {
diff --git a/runsc/config/BUILD b/runsc/config/BUILD
new file mode 100644
index 000000000..b1672bb9d
--- /dev/null
+++ b/runsc/config/BUILD
@@ -0,0 +1,28 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "config",
+    srcs = [
+        "config.go",
+        "flags.go",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/refs",
+        "//pkg/sentry/watchdog",
+        "//pkg/sync",
+        "//runsc/flag",
+    ],
+)
+
+go_test(
+    name = "config_test",
+    size = "small",
+    srcs = [
+        "config_test.go",
+    ],
+    library = ":config",
+    deps = ["//runsc/flag"],
+)
diff --git a/runsc/boot/config.go b/runsc/config/config.go
index bcec7e4db..b02d8e2e1 100644
--- a/runsc/boot/config.go
+++ b/runsc/config/config.go
@@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,214 +12,115 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package boot
+// Package config provides basic infrastructure to set configuration settings
+// for runsc. The configuration is set by command-line flags, and it can also
+// be propagated to a different process using the same flags.
+package config
 
 import (
 	"fmt"
-	"strconv"
-	"strings"
 
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 )
 
-// FileAccessType tells how the filesystem is accessed.
-type FileAccessType int
-
-const (
-	// FileAccessShared sends IO requests to a Gofer process that validates the
-	// requests and forwards them to the host.
-	FileAccessShared FileAccessType = iota
-
-	// FileAccessExclusive is the same as FileAccessShared, but enables
-	// extra caching for improved performance. It should only be used if
-	// the sandbox has exclusive access to the filesystem.
-	FileAccessExclusive
-)
-
-// MakeFileAccessType converts type from string.
-func MakeFileAccessType(s string) (FileAccessType, error) {
-	switch s {
-	case "shared":
-		return FileAccessShared, nil
-	case "exclusive":
-		return FileAccessExclusive, nil
-	default:
-		return 0, fmt.Errorf("invalid file access type %q", s)
-	}
-}
-
-func (f FileAccessType) String() string {
-	switch f {
-	case FileAccessShared:
-		return "shared"
-	case FileAccessExclusive:
-		return "exclusive"
-	default:
-		return fmt.Sprintf("unknown(%d)", f)
-	}
-}
-
-// NetworkType tells which network stack to use.
-type NetworkType int
-
-const (
-	// NetworkSandbox uses internal network stack, isolated from the host.
-	NetworkSandbox NetworkType = iota
-
-	// NetworkHost redirects network related syscalls to the host network.
-	NetworkHost
-
-	// NetworkNone sets up just loopback using netstack.
-	NetworkNone
-)
-
-// MakeNetworkType converts type from string.
-func MakeNetworkType(s string) (NetworkType, error) {
-	switch s {
-	case "sandbox":
-		return NetworkSandbox, nil
-	case "host":
-		return NetworkHost, nil
-	case "none":
-		return NetworkNone, nil
-	default:
-		return 0, fmt.Errorf("invalid network type %q", s)
-	}
-}
-
-func (n NetworkType) String() string {
-	switch n {
-	case NetworkSandbox:
-		return "sandbox"
-	case NetworkHost:
-		return "host"
-	case NetworkNone:
-		return "none"
-	default:
-		return fmt.Sprintf("unknown(%d)", n)
-	}
-}
-
-// MakeWatchdogAction converts type from string.
-func MakeWatchdogAction(s string) (watchdog.Action, error) {
-	switch strings.ToLower(s) {
-	case "log", "logwarning":
-		return watchdog.LogWarning, nil
-	case "panic":
-		return watchdog.Panic, nil
-	default:
-		return 0, fmt.Errorf("invalid watchdog action %q", s)
-	}
-}
-
-// MakeRefsLeakMode converts type from string.
-func MakeRefsLeakMode(s string) (refs.LeakMode, error) {
-	switch strings.ToLower(s) {
-	case "disabled":
-		return refs.NoLeakChecking, nil
-	case "log-names":
-		return refs.LeaksLogWarning, nil
-	case "log-traces":
-		return refs.LeaksLogTraces, nil
-	default:
-		return 0, fmt.Errorf("invalid refs leakmode %q", s)
-	}
-}
-
-func refsLeakModeToString(mode refs.LeakMode) string {
-	switch mode {
-	// If not set, default it to disabled.
-	case refs.UninitializedLeakChecking, refs.NoLeakChecking:
-		return "disabled"
-	case refs.LeaksLogWarning:
-		return "log-names"
-	case refs.LeaksLogTraces:
-		return "log-traces"
-	default:
-		panic(fmt.Sprintf("Invalid leakmode: %d", mode))
-	}
-}
-
 // Config holds configuration that is not part of the runtime spec.
+//
+// Follow these steps to add a new flag:
+//   1. Create a new field in Config.
+//   2. Add a field tag with the flag name
+//   3. Register a new flag in flags.go, with name and description
+//   4. Add any necessary validation into validate()
+//   5. If adding an enum, follow the same pattern as FileAccessType
+//
 type Config struct {
 	// RootDir is the runtime root directory.
-	RootDir string
+	RootDir string `flag:"root"`
+
+	// Traceback changes the Go runtime's traceback level.
+	Traceback string `flag:"traceback"`
 
 	// Debug indicates that debug logging should be enabled.
-	Debug bool
+	Debug bool `flag:"debug"`
 
 	// LogFilename is the filename to log to, if not empty.
-	LogFilename string
+	LogFilename string `flag:"log"`
 
 	// LogFormat is the log format.
-	LogFormat string
+	LogFormat string `flag:"log-format"`
 
 	// DebugLog is the path to log debug information to, if not empty.
-	DebugLog string
+	DebugLog string `flag:"debug-log"`
 
 	// PanicLog is the path to log GO's runtime messages, if not empty.
-	PanicLog string
+	PanicLog string `flag:"panic-log"`
 
 	// DebugLogFormat is the log format for debug.
-	DebugLogFormat string
+	DebugLogFormat string `flag:"debug-log-format"`
 
 	// FileAccess indicates how the filesystem is accessed.
-	FileAccess FileAccessType
+	FileAccess FileAccessType `flag:"file-access"`
 
 	// Overlay is whether to wrap the root filesystem in an overlay.
-	Overlay bool
+	Overlay bool `flag:"overlay"`
 
 	// FSGoferHostUDS enables the gofer to mount a host UDS.
-	FSGoferHostUDS bool
+	FSGoferHostUDS bool `flag:"fsgofer-host-uds"`
 
 	// Network indicates what type of network to use.
-	Network NetworkType
+	Network NetworkType `flag:"network"`
 
 	// EnableRaw indicates whether raw sockets should be enabled. Raw
 	// sockets are disabled by stripping CAP_NET_RAW from the list of
 	// capabilities.
-	EnableRaw bool
+	EnableRaw bool `flag:"net-raw"`
 
 	// HardwareGSO indicates that hardware segmentation offload is enabled.
-	HardwareGSO bool
+	HardwareGSO bool `flag:"gso"`
 
 	// SoftwareGSO indicates that software segmentation offload is enabled.
-	SoftwareGSO bool
+	SoftwareGSO bool `flag:"software-gso"`
+
+	// TXChecksumOffload indicates that TX Checksum Offload is enabled.
+	TXChecksumOffload bool `flag:"tx-checksum-offload"`
+
+	// RXChecksumOffload indicates that RX Checksum Offload is enabled.
+	RXChecksumOffload bool `flag:"rx-checksum-offload"`
 
 	// QDisc indicates the type of queuening discipline to use by default
 	// for non-loopback interfaces.
-	QDisc QueueingDiscipline
+	QDisc QueueingDiscipline `flag:"qdisc"`
 
 	// LogPackets indicates that all network packets should be logged.
-	LogPackets bool
+	LogPackets bool `flag:"log-packets"`
 
 	// Platform is the platform to run on.
-	Platform string
+	Platform string `flag:"platform"`
 
 	// Strace indicates that strace should be enabled.
-	Strace bool
+	Strace bool `flag:"strace"`
 
-	// StraceSyscalls is the set of syscalls to trace.  If StraceEnable is
-	// true and this list is empty, then all syscalls will be traced.
-	StraceSyscalls []string
+	// StraceSyscalls is the set of syscalls to trace (comma-separated values).
+	// If Strace is true and this string is empty, then all syscalls will be
+	// traced.
+	StraceSyscalls string `flag:"strace-syscalls"`
 
 	// StraceLogSize is the max size of data blobs to display.
-	StraceLogSize uint
+	StraceLogSize uint `flag:"strace-log-size"`
 
 	// DisableSeccomp indicates whether seccomp syscall filters should be
 	// disabled. Pardon the double negation, but default to enabled is important.
 	DisableSeccomp bool
 
 	// WatchdogAction sets what action the watchdog takes when triggered.
-	WatchdogAction watchdog.Action
+	WatchdogAction watchdog.Action `flag:"watchdog-action"`
 
 	// PanicSignal registers signal handling that panics. Usually set to
 	// SIGUSR2(12) to troubleshoot hangs. -1 disables it.
-	PanicSignal int
+	PanicSignal int `flag:"panic-signal"`
 
 	// ProfileEnable is set to prepare the sandbox to be profiled.
-	ProfileEnable bool
+	ProfileEnable bool `flag:"profile"`
 
 	// RestoreFile is the path to the saved container image
 	RestoreFile string
@@ -227,95 +128,215 @@ type Config struct {
 	// NumNetworkChannels controls the number of AF_PACKET sockets that map
 	// to the same underlying network device. This allows netstack to better
 	// scale for high throughput use cases.
-	NumNetworkChannels int
+	NumNetworkChannels int `flag:"num-network-channels"`
 
 	// Rootless allows the sandbox to be started with a user that is not root.
 	// Defense is depth measures are weaker with rootless. Specifically, the
 	// sandbox and Gofer process run as root inside a user namespace with root
 	// mapped to the caller's user.
-	Rootless bool
+	Rootless bool `flag:"rootless"`
 
 	// AlsoLogToStderr allows to send log messages to stderr.
-	AlsoLogToStderr bool
+	AlsoLogToStderr bool `flag:"alsologtostderr"`
 
-	// ReferenceLeakMode sets reference leak check mode
+	// ReferenceLeak sets the reference leak check mode.
-	ReferenceLeakMode refs.LeakMode
+	ReferenceLeak refs.LeakMode `flag:"ref-leak-mode"`
 
 	// OverlayfsStaleRead instructs the sandbox to assume that the root mount
 	// is on a Linux overlayfs mount, which does not necessarily preserve
 	// coherence between read-only and subsequent writable file descriptors
 	// representing the "same" file.
-	OverlayfsStaleRead bool
+	OverlayfsStaleRead bool `flag:"overlayfs-stale-read"`
+
+	// CPUNumFromQuota sets CPU number count to available CPU quota, using
+	// least integer value greater than or equal to quota.
+	//
+	// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
+	CPUNumFromQuota bool `flag:"cpu-num-from-quota"`
+
+	// VFS2 enables VFS2.
+	VFS2 bool `flag:"vfs2"`
+
+	// FUSE enables FUSE usage.
+	FUSE bool `flag:"fuse"`
+
+	// AllowFlagOverride allows overriding of flags in OCI annotations.
+	AllowFlagOverride bool `flag:"allow-flag-override"`
+
+	// OCISeccomp enables seccomp inside the sandbox.
+	OCISeccomp bool `flag:"oci-seccomp"`
 
 	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
 	// tests. It allows runsc to start the sandbox process as the current
 	// user, and without chrooting the sandbox process. This can be
 	// necessary in test environments that have limited capabilities.
-	TestOnlyAllowRunAsCurrentUserWithoutChroot bool
+	TestOnlyAllowRunAsCurrentUserWithoutChroot bool `flag:"TESTONLY-unsafe-nonroot"`
 
 	// TestOnlyTestNameEnv should only be used in tests. It looks up for the
 	// test name in the container environment variables and adds it to the debug
 	// log file name. This is done to help identify the log with the test when
 	// multiple tests are run in parallel, since there is no way to pass
 	// parameters to the runtime from docker.
-	TestOnlyTestNameEnv string
+	TestOnlyTestNameEnv string `flag:"TESTONLY-test-name-env"`
+}
 
-	// CPUNumFromQuota sets CPU number count to available CPU quota, using
-	// least integer value greater than or equal to quota.
-	//
-	// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
-	CPUNumFromQuota bool
+func (c *Config) validate() error {
+	if c.FileAccess == FileAccessShared && c.Overlay {
+		return fmt.Errorf("overlay flag is incompatible with shared file access")
+	}
+	if c.NumNetworkChannels <= 0 {
+		return fmt.Errorf("num_network_channels must be > 0, got: %d", c.NumNetworkChannels)
+	}
+	return nil
+}
+
+// FileAccessType tells how the filesystem is accessed.
+type FileAccessType int
 
-	// Enables VFS2 (not plumbled through yet).
-	VFS2 bool
+const (
+	// FileAccessExclusive is the same as FileAccessShared, but enables
+	// extra caching for improved performance. It should only be used if
+	// the sandbox has exclusive access to the filesystem.
+	FileAccessExclusive FileAccessType = iota
+
+	// FileAccessShared sends IO requests to a Gofer process that validates the
+	// requests and forwards them to the host.
+	FileAccessShared
+)
+
+func fileAccessTypePtr(v FileAccessType) *FileAccessType {
+	return &v
 }
 
-// ToFlags returns a slice of flags that correspond to the given Config.
-func (c *Config) ToFlags() []string {
-	f := []string{
-		"--root=" + c.RootDir,
-		"--debug=" + strconv.FormatBool(c.Debug),
-		"--log=" + c.LogFilename,
-		"--log-format=" + c.LogFormat,
-		"--debug-log=" + c.DebugLog,
-		"--panic-log=" + c.PanicLog,
-		"--debug-log-format=" + c.DebugLogFormat,
-		"--file-access=" + c.FileAccess.String(),
-		"--overlay=" + strconv.FormatBool(c.Overlay),
-		"--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS),
-		"--network=" + c.Network.String(),
-		"--log-packets=" + strconv.FormatBool(c.LogPackets),
-		"--platform=" + c.Platform,
-		"--strace=" + strconv.FormatBool(c.Strace),
-		"--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","),
-		"--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)),
-		"--watchdog-action=" + c.WatchdogAction.String(),
-		"--panic-signal=" + strconv.Itoa(c.PanicSignal),
-		"--profile=" + strconv.FormatBool(c.ProfileEnable),
-		"--net-raw=" + strconv.FormatBool(c.EnableRaw),
-		"--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
-		"--rootless=" + strconv.FormatBool(c.Rootless),
-		"--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr),
-		"--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode),
-		"--gso=" + strconv.FormatBool(c.HardwareGSO),
-		"--software-gso=" + strconv.FormatBool(c.SoftwareGSO),
-		"--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
-		"--qdisc=" + c.QDisc.String(),
+// Set implements flag.Value.
+func (f *FileAccessType) Set(v string) error {
+	switch v {
+	case "shared":
+		*f = FileAccessShared
+	case "exclusive":
+		*f = FileAccessExclusive
+	default:
+		return fmt.Errorf("invalid file access type %q", v)
+	}
+	return nil
+}
+
+// Get implements flag.Value.
+func (f *FileAccessType) Get() interface{} {
+	return *f
+}
+
+// String implements flag.Value.
+func (f *FileAccessType) String() string {
+	switch *f {
+	case FileAccessShared:
+		return "shared"
+	case FileAccessExclusive:
+		return "exclusive"
 	}
-	if c.CPUNumFromQuota {
-		f = append(f, "--cpu-num-from-quota")
+	panic(fmt.Sprintf("Invalid file access type %v", *f))
+}
+
+// NetworkType tells which network stack to use.
+type NetworkType int
+
+const (
+	// NetworkSandbox uses internal network stack, isolated from the host.
+	NetworkSandbox NetworkType = iota
+
+	// NetworkHost redirects network related syscalls to the host network.
+	NetworkHost
+
+	// NetworkNone sets up just loopback using netstack.
+	NetworkNone
+)
+
+func networkTypePtr(v NetworkType) *NetworkType {
+	return &v
+}
+
+// Set implements flag.Value.
+func (n *NetworkType) Set(v string) error {
+	switch v {
+	case "sandbox":
+		*n = NetworkSandbox
+	case "host":
+		*n = NetworkHost
+	case "none":
+		*n = NetworkNone
+	default:
+		return fmt.Errorf("invalid network type %q", v)
 	}
-	// Only include these if set since it is never to be used by users.
-	if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
-		f = append(f, "--TESTONLY-unsafe-nonroot=true")
+	return nil
+}
+
+// Get implements flag.Value.
+func (n *NetworkType) Get() interface{} {
+	return *n
+}
+
+// String implements flag.Value.
+func (n *NetworkType) String() string {
+	switch *n {
+	case NetworkSandbox:
+		return "sandbox"
+	case NetworkHost:
+		return "host"
+	case NetworkNone:
+		return "none"
 	}
-	if len(c.TestOnlyTestNameEnv) != 0 {
-		f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv)
+	panic(fmt.Sprintf("Invalid network type %v", *n))
+}
+
+// QueueingDiscipline is used to specify the kind of Queueing Discipline to
+// apply for a given FDBasedLink.
+type QueueingDiscipline int
+
+const (
+	// QDiscNone disables any queueing for the underlying FD.
+	QDiscNone QueueingDiscipline = iota
+
+	// QDiscFIFO applies a simple fifo based queue to the underlying FD.
+	QDiscFIFO
+)
+
+func queueingDisciplinePtr(v QueueingDiscipline) *QueueingDiscipline {
+	return &v
+}
+
+// Set implements flag.Value.
+func (q *QueueingDiscipline) Set(v string) error {
+	switch v {
+	case "none":
+		*q = QDiscNone
+	case "fifo":
+		*q = QDiscFIFO
+	default:
+		return fmt.Errorf("invalid qdisc %q", v)
 	}
+	return nil
+}
+
+// Get implements flag.Value.
+func (q *QueueingDiscipline) Get() interface{} {
+	return *q
+}
 
-	if c.VFS2 {
-		f = append(f, "--vfs2=true")
+// String implements flag.Value.
+func (q *QueueingDiscipline) String() string {
+	switch *q {
+	case QDiscNone:
+		return "none"
+	case QDiscFIFO:
+		return "fifo"
 	}
+	panic(fmt.Sprintf("Invalid qdisc %v", *q))
+}
+
+func leakModePtr(v refs.LeakMode) *refs.LeakMode {
+	return &v
+}
 
-	return f
+func watchdogActionPtr(v watchdog.Action) *watchdog.Action {
+	return &v
 }
diff --git a/runsc/config/config_test.go b/runsc/config/config_test.go
new file mode 100644
index 000000000..fb162b7eb
--- /dev/null
+++ b/runsc/config/config_test.go
@@ -0,0 +1,272 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package config
+
+import (
+	"strings"
+	"testing"
+
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+func init() {
+	RegisterFlags()
+}
+
+func TestDefault(t *testing.T) {
+	c, err := NewFromFlags()
+	if err != nil {
+		t.Fatal(err)
+	}
+	// "--root" is always set to something different than the default. Reset it
+	// to make it easier to test that default values do not generate flags.
+	c.RootDir = ""
+
+	// Default values don't require any flags to be set.
+	flags := c.ToFlags()
+	if len(flags) > 0 {
+		t.Errorf("default flags not set correctly for: %s", flags)
+	}
+}
+
+func setDefault(name string) {
+	fl := flag.CommandLine.Lookup(name)
+	fl.Value.Set(fl.DefValue)
+}
+
+func TestFromFlags(t *testing.T) {
+	flag.CommandLine.Lookup("root").Value.Set("some-path")
+	flag.CommandLine.Lookup("debug").Value.Set("true")
+	flag.CommandLine.Lookup("num-network-channels").Value.Set("123")
+	flag.CommandLine.Lookup("network").Value.Set("none")
+	defer func() {
+		setDefault("root")
+		setDefault("debug")
+		setDefault("num-network-channels")
+		setDefault("network")
+	}()
+
+	c, err := NewFromFlags()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if want := "some-path"; c.RootDir != want {
+		t.Errorf("RootDir=%v, want: %v", c.RootDir, want)
+	}
+	if want := true; c.Debug != want {
+		t.Errorf("Debug=%v, want: %v", c.Debug, want)
+	}
+	if want := 123; c.NumNetworkChannels != want {
+		t.Errorf("NumNetworkChannels=%v, want: %v", c.NumNetworkChannels, want)
+	}
+	if want := NetworkNone; c.Network != want {
+		t.Errorf("Network=%v, want: %v", c.Network, want)
+	}
+}
+
+func TestToFlags(t *testing.T) {
+	c, err := NewFromFlags()
+	if err != nil {
+		t.Fatal(err)
+	}
+	c.RootDir = "some-path"
+	c.Debug = true
+	c.NumNetworkChannels = 123
+	c.Network = NetworkNone
+
+	flags := c.ToFlags()
+	if len(flags) != 4 {
+		t.Errorf("wrong number of flags set, want: 4, got: %d: %s", len(flags), flags)
+	}
+	t.Logf("Flags: %s", flags)
+	fm := map[string]string{}
+	for _, f := range flags {
+		kv := strings.Split(f, "=")
+		fm[kv[0]] = kv[1]
+	}
+	for name, want := range map[string]string{
+		"--root":                 "some-path",
+		"--debug":                "true",
+		"--num-network-channels": "123",
+		"--network":              "none",
+	} {
+		if got, ok := fm[name]; ok {
+			if got != want {
+				t.Errorf("flag %q, want: %q, got: %q", name, want, got)
+			}
+		} else {
+			t.Errorf("flag %q not set", name)
+		}
+	}
+}
+
+// TestInvalidFlags checks that enum flags fail when value is not in enum set.
+func TestInvalidFlags(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		error string
+	}{
+		{
+			name:  "file-access",
+			error: "invalid file access type",
+		},
+		{
+			name:  "network",
+			error: "invalid network type",
+		},
+		{
+			name:  "qdisc",
+			error: "invalid qdisc",
+		},
+		{
+			name:  "watchdog-action",
+			error: "invalid watchdog action",
+		},
+		{
+			name:  "ref-leak-mode",
+			error: "invalid ref leak mode",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			defer setDefault(tc.name)
+			if err := flag.CommandLine.Lookup(tc.name).Value.Set("invalid"); err == nil || !strings.Contains(err.Error(), tc.error) {
+				t.Errorf("flag.Value.Set(invalid) wrong error reported: %v", err)
+			}
+		})
+	}
+}
+
+func TestValidationFail(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		flags map[string]string
+		error string
+	}{
+		{
+			name: "shared+overlay",
+			flags: map[string]string{
+				"file-access": "shared",
+				"overlay":     "true",
+			},
+			error: "overlay flag is incompatible",
+		},
+		{
+			name: "network-channels",
+			flags: map[string]string{
+				"num-network-channels": "-1",
+			},
+			error: "num_network_channels must be > 0",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			for name, val := range tc.flags {
+				defer setDefault(name)
+				if err := flag.CommandLine.Lookup(name).Value.Set(val); err != nil {
+					t.Errorf("%s=%q: %v", name, val, err)
+				}
+			}
+			if _, err := NewFromFlags(); err == nil || !strings.Contains(err.Error(), tc.error) {
+				t.Errorf("NewFromFlags() wrong error reported: %v", err)
+			}
+		})
+	}
+}
+
+func TestOverride(t *testing.T) {
+	c, err := NewFromFlags()
+	if err != nil {
+		t.Fatal(err)
+	}
+	c.AllowFlagOverride = true
+
+	t.Run("string", func(t *testing.T) {
+		c.RootDir = "foobar"
+		if err := c.Override("root", "bar"); err != nil {
+			t.Fatalf("Override(root, bar) failed: %v", err)
+		}
+		defer setDefault("root")
+		if c.RootDir != "bar" {
+			t.Errorf("Override(root, bar) didn't work: %+v", c)
+		}
+	})
+
+	t.Run("bool", func(t *testing.T) {
+		c.Debug = true
+		if err := c.Override("debug", "false"); err != nil {
+			t.Fatalf("Override(debug, false) failed: %v", err)
+		}
+		defer setDefault("debug")
+		if c.Debug {
+			t.Errorf("Override(debug, false) didn't work: %+v", c)
+		}
+	})
+
+	t.Run("enum", func(t *testing.T) {
+		c.FileAccess = FileAccessShared
+		if err := c.Override("file-access", "exclusive"); err != nil {
+			t.Fatalf("Override(file-access, exclusive) failed: %v", err)
+		}
+		defer setDefault("file-access")
+		if c.FileAccess != FileAccessExclusive {
+			t.Errorf("Override(file-access, exclusive) didn't work: %+v", c)
+		}
+	})
+}
+
+func TestOverrideDisabled(t *testing.T) {
+	c, err := NewFromFlags()
+	if err != nil {
+		t.Fatal(err)
+	}
+	const errMsg = "flag override disabled"
+	if err := c.Override("root", "path"); err == nil || !strings.Contains(err.Error(), errMsg) {
+		t.Errorf("Override() wrong error: %v", err)
+	}
+}
+
+func TestOverrideError(t *testing.T) {
+	c, err := NewFromFlags()
+	if err != nil {
+		t.Fatal(err)
+	}
+	c.AllowFlagOverride = true
+	for _, tc := range []struct {
+		name  string
+		value string
+		error string
+	}{
+		{
+			name:  "invalid",
+			value: "valid",
+			error: `flag "invalid" not found`,
+		},
+		{
+			name:  "debug",
+			value: "invalid",
+			error: "error setting flag debug",
+		},
+		{
+			name:  "file-access",
+			value: "invalid",
+			error: "invalid file access type",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			if err := c.Override(tc.name, tc.value); err == nil || !strings.Contains(err.Error(), tc.error) {
+				t.Errorf("Override(%q, %q) wrong error: %v", tc.name, tc.value, err)
+			}
+		})
+	}
+}
diff --git a/runsc/config/flags.go b/runsc/config/flags.go
new file mode 100644
index 000000000..d3203b565
--- /dev/null
+++ b/runsc/config/flags.go
@@ -0,0 +1,206 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package config
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"reflect"
+	"strconv"
+
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/watchdog"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+var registration sync.Once
+
+// RegisterFlags registers, at most once, the set of flags used to populate Config.
+func RegisterFlags() {
+	registration.Do(func() {
+		// Although these flags are not part of the OCI spec, they are used by
+		// Docker, and thus should not be changed.
+		flag.String("root", "", "root directory for storage of container state.")
+		flag.String("log", "", "file path where internal debug information is written, default is stdout.")
+		flag.String("log-format", "text", "log format: text (default), json, or json-k8s.")
+		flag.Bool("debug", false, "enable debug logging.")
+
+		// These flags are unique to runsc, and are used to configure parts of the
+		// system that are not covered by the runtime spec.
+
+		// Debugging flags.
+		flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
+		flag.String("panic-log", "", "file path were panic reports and other Go's runtime messages are written.")
+		flag.Bool("log-packets", false, "enable network packet logging.")
+		flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.")
+		flag.Bool("alsologtostderr", false, "send log messages to stderr.")
+		flag.Bool("allow-flag-override", false, "allow OCI annotations (dev.gvisor.flag.<name>) to override flags for debugging.")
+		flag.String("traceback", "system", "golang runtime's traceback level")
+
+		// Debugging flags: strace related
+		flag.Bool("strace", false, "enable strace.")
+		flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.")
+		flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs.")
+
+		// Flags that control sandbox runtime behavior.
+		flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.")
+		flag.Var(watchdogActionPtr(watchdog.LogWarning), "watchdog-action", "sets what action the watchdog takes when triggered: log (default), panic.")
+		flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
+		flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
+		flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
+		flag.Var(leakModePtr(refs.NoLeakChecking), "ref-leak-mode", "sets reference leak check mode: disabled (default), log-names, log-traces.")
+		flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)")
+		flag.Bool("oci-seccomp", false, "Enables loading OCI seccomp filters inside the sandbox.")
+
+		// Flags that control sandbox runtime behavior: FS related.
+		flag.Var(fileAccessTypePtr(FileAccessExclusive), "file-access", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
+		flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
+		flag.Bool("overlayfs-stale-read", true, "assume root mount is an overlay filesystem")
+		flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.")
+		flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.")
+		flag.Bool("fuse", false, "TEST ONLY; use while FUSE in VFSv2 is landing. This allows the use of the new experimental FUSE filesystem.")
+
+		// Flags that control sandbox runtime behavior: network related.
+		flag.Var(networkTypePtr(NetworkSandbox), "network", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
+		flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
+		flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.")
+		flag.Bool("software-gso", true, "enable software segmentation offload when hardware offload can't be enabled.")
+		flag.Bool("tx-checksum-offload", false, "enable TX checksum offload.")
+		flag.Bool("rx-checksum-offload", true, "enable RX checksum offload.")
+		flag.Var(queueingDisciplinePtr(QDiscFIFO), "qdisc", "specifies which queueing discipline to apply by default to the non loopback nics used by the sandbox.")
+		flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.")
+
+		// Test flags, not to be used outside tests, ever.
+		flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
+		flag.String("TESTONLY-test-name-env", "", "TEST ONLY; do not ever use! Used for automated tests to improve logging.")
+	})
+}
+
+// NewFromFlags returns a Config populated from the command line flags: every
+// field tagged `flag:"<name>"` receives the current value of flag <name>.
+func NewFromFlags() (*Config, error) {
+	conf := &Config{}
+	cfgVal := reflect.ValueOf(conf).Elem()
+	cfgType := cfgVal.Type()
+	for idx := 0; idx < cfgType.NumField(); idx++ {
+		flagName, tagged := cfgType.Field(idx).Tag.Lookup("flag")
+		if !tagged {
+			// This field is not backed by a flag.
+			continue
+		}
+		fl := flag.CommandLine.Lookup(flagName)
+		if fl == nil {
+			panic(fmt.Sprintf("Flag %q not found", flagName))
+		}
+		cfgVal.Field(idx).Set(reflect.ValueOf(flag.Get(fl.Value)))
+	}
+
+	if conf.RootDir == "" {
+		// No root directory was given: fall back to a location that is
+		// (hopefully) user-writeable, preferring XDG_RUNTIME_DIR when set.
+		conf.RootDir = "/var/run/runsc"
+		if xdgDir := os.Getenv("XDG_RUNTIME_DIR"); xdgDir != "" {
+			conf.RootDir = filepath.Join(xdgDir, "runsc")
+		}
+	}
+
+	// Hand back the Config only if it passes validation.
+	if err := conf.validate(); err != nil {
+		return nil, err
+	}
+	return conf, nil
+}
+
+// ToFlags returns c as a slice of "--name=value" flags, skipping values equal to the flag default.
+func (c *Config) ToFlags() []string {
+	var rv []string
+
+	obj := reflect.ValueOf(c).Elem()
+	st := obj.Type()
+	for i := 0; i < st.NumField(); i++ {
+		f := st.Field(i)
+		name, ok := f.Tag.Lookup("flag")
+		if !ok {
+			// No flag set for this field.
+			continue
+		}
+		val := getVal(obj.Field(i))
+		// Call it fl rather than flag so the imported flag package isn't shadowed.
+		fl := flag.CommandLine.Lookup(name)
+		if fl == nil {
+			panic(fmt.Sprintf("Flag %q not found", name))
+		}
+		if val == fl.DefValue {
+			continue
+		}
+		rv = append(rv, fmt.Sprintf("--%s=%s", fl.Name, val))
+	}
+	return rv
+}
+
+// Override sets flag name to value in c. It requires AllowFlagOverride to be set.
+func (c *Config) Override(name string, value string) error {
+	if !c.AllowFlagOverride {
+		return fmt.Errorf("flag override disabled, use --allow-flag-override to enable it")
+	}
+
+	obj := reflect.ValueOf(c).Elem()
+	st := obj.Type()
+	for i := 0; i < st.NumField(); i++ {
+		f := st.Field(i)
+		fieldName, ok := f.Tag.Lookup("flag")
+		if !ok || fieldName != name {
+			// Not a flag field, or flag name doesn't match.
+			continue
+		}
+		fl := flag.CommandLine.Lookup(name)
+		if fl == nil {
+			// Flag must exist if there is a field match above.
+			panic(fmt.Sprintf("Flag %q not found", name))
+		}
+
+		// Use flag to convert the string value to the underlying flag type, using the
+		// same rules as the command-line. This also changes the flag in flag.CommandLine.
+		if err := fl.Value.Set(value); err != nil {
+			return fmt.Errorf("error setting flag %s=%q: %w", name, value, err)
+		}
+		x := reflect.ValueOf(flag.Get(fl.Value))
+		obj.Field(i).Set(x)
+
+		// Validates the config again to ensure it's left in a consistent state.
+		return c.validate()
+	}
+	return fmt.Errorf("flag %q not found. Cannot set it to %q", name, value)
+}
+
+func getVal(field reflect.Value) string {
+	if s, isStringer := field.Addr().Interface().(fmt.Stringer); isStringer {
+		return s.String()
+	}
+	switch kind := field.Kind(); kind {
+	case reflect.String:
+		return field.String()
+	case reflect.Bool:
+		return strconv.FormatBool(field.Bool())
+	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
+		return strconv.FormatInt(field.Int(), 10)
+	case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr:
+		return strconv.FormatUint(field.Uint(), 10)
+	default:
+		panic("unknown type " + kind.String())
+	}
+}
diff --git a/runsc/console/console.go b/runsc/console/console.go
index 64b23639a..dbb88e117 100644
--- a/runsc/console/console.go
+++ b/runsc/console/console.go
@@ -24,11 +24,11 @@ import (
 	"golang.org/x/sys/unix"
 )
 
-// NewWithSocket creates pty master/slave pair, sends the master FD over the given
-// socket, and returns the slave.
+// NewWithSocket creates pty master/replica pair, sends the master FD over the given
+// socket, and returns the replica.
 func NewWithSocket(socketPath string) (*os.File, error) {
-	// Create a new pty master and slave.
-	ptyMaster, ptySlave, err := pty.Open()
+	// Create a new pty master and replica.
+	ptyMaster, ptyReplica, err := pty.Open()
 	if err != nil {
 		return nil, fmt.Errorf("opening pty: %v", err)
 	}
@@ -37,18 +37,18 @@ func NewWithSocket(socketPath string) (*os.File, error) {
 	// Get a connection to the socket path.
 	conn, err := net.Dial("unix", socketPath)
 	if err != nil {
-		ptySlave.Close()
+		ptyReplica.Close()
 		return nil, fmt.Errorf("dialing socket %q: %v", socketPath, err)
 	}
 	defer conn.Close()
 	uc, ok := conn.(*net.UnixConn)
 	if !ok {
-		ptySlave.Close()
+		ptyReplica.Close()
 		return nil, fmt.Errorf("connection is not a UnixConn: %T", conn)
 	}
 	socket, err := uc.File()
 	if err != nil {
-		ptySlave.Close()
+		ptyReplica.Close()
 		return nil, fmt.Errorf("getting file for unix socket %v: %v", uc, err)
 	}
 	defer socket.Close()
@@ -56,8 +56,8 @@ func NewWithSocket(socketPath string) (*os.File, error) {
 	// Send the master FD over the connection.
 	msg := unix.UnixRights(int(ptyMaster.Fd()))
 	if err := unix.Sendmsg(int(socket.Fd()), []byte("pty-master"), msg, nil, 0); err != nil {
-		ptySlave.Close()
+		ptyReplica.Close()
 		return nil, fmt.Errorf("sending console over unix socket %q: %v", socketPath, err)
 	}
-	return ptySlave, nil
+	return ptyReplica, nil
 }
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index 49cfb0837..c33755482 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -23,11 +23,12 @@ go_library(
         "//pkg/sync",
         "//runsc/boot",
         "//runsc/cgroup",
+        "//runsc/config",
         "//runsc/sandbox",
         "//runsc/specutils",
         "@com_github_cenkalti_backoff//:go_default_library",
         "@com_github_gofrs_flock//:go_default_library",
-        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
     ],
 )
 
@@ -65,10 +66,11 @@ go_test(
         "//pkg/urpc",
         "//runsc/boot",
         "//runsc/boot/platforms",
+        "//runsc/config",
         "//runsc/specutils",
         "@com_github_cenkalti_backoff//:go_default_library",
         "@com_github_kr_pty//:go_default_library",
-        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 3813c6b93..4228399b8 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -122,6 +122,7 @@ func TestConsoleSocket(t *testing.T) {
 	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			spec := testutil.NewSpecWithArgs("true")
+			spec.Process.Terminal = true
 			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 			if err != nil {
 				t.Fatalf("error setting up container: %v", err)
@@ -184,14 +185,14 @@ func TestJobControlSignalExec(t *testing.T) {
 		t.Fatalf("error starting container: %v", err)
 	}
 
-	// Create a pty master/slave. The slave will be passed to the exec
+	// Create a pty master/replica. The replica will be passed to the exec
 	// process.
-	ptyMaster, ptySlave, err := pty.Open()
+	ptyMaster, ptyReplica, err := pty.Open()
 	if err != nil {
 		t.Fatalf("error opening pty: %v", err)
 	}
 	defer ptyMaster.Close()
-	defer ptySlave.Close()
+	defer ptyReplica.Close()
 
 	// Exec bash and attach a terminal. Note that occasionally /bin/sh
 	// may be a different shell or have a different configuration (such
@@ -202,9 +203,9 @@ func TestJobControlSignalExec(t *testing.T) {
 		// Don't let bash execute from profile or rc files, otherwise
 		// our PID counts get messed up.
 		Argv: []string{"/bin/bash", "--noprofile", "--norc"},
-		// Pass the pty slave as FD 0, 1, and 2.
+		// Pass the pty replica as FD 0, 1, and 2.
 		FilePayload: urpc.FilePayload{
-			Files: []*os.File{ptySlave, ptySlave, ptySlave},
+			Files: []*os.File{ptyReplica, ptyReplica, ptyReplica},
 		},
 		StdioIsPty: true,
 	}
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 6d297d0df..52e1755ce 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -37,6 +37,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/sighandling"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/cgroup"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/sandbox"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
@@ -269,7 +270,7 @@ type Args struct {
 // New creates the container in a new Sandbox process, unless the metadata
 // indicates that an existing Sandbox should be used. The caller must call
 // Destroy() on the container.
-func New(conf *boot.Config, args Args) (*Container, error) {
+func New(conf *config.Config, args Args) (*Container, error) {
 	log.Debugf("Create container %q in root dir: %s", args.ID, conf.RootDir)
 	if err := validateID(args.ID); err != nil {
 		return nil, err
@@ -311,6 +312,14 @@ func New(conf *boot.Config, args Args) (*Container, error) {
 	if isRoot(args.Spec) {
 		log.Debugf("Creating new sandbox for container %q", args.ID)
 
+		if args.Spec.Linux == nil {
+			args.Spec.Linux = &specs.Linux{}
+		}
+		// Don't force the use of cgroups in tests because they lack permission to do so.
+		if args.Spec.Linux.CgroupsPath == "" && !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+			args.Spec.Linux.CgroupsPath = "/" + args.ID
+		}
+
 		// Create and join cgroup before processes are created to ensure they are
 		// part of the cgroup from the start (and all their children processes).
 		cg, err := cgroup.New(args.Spec)
@@ -320,11 +329,17 @@ func New(conf *boot.Config, args Args) (*Container, error) {
 		if cg != nil {
 			// If there is cgroup config, install it before creating sandbox process.
 			if err := cg.Install(args.Spec.Linux.Resources); err != nil {
-				return nil, fmt.Errorf("configuring cgroup: %v", err)
+				switch {
+				case errors.Is(err, syscall.EACCES) && conf.Rootless:
+					log.Warningf("Skipping cgroup configuration in rootless mode: %v", err)
+					cg = nil
+				default:
+					return nil, fmt.Errorf("configuring cgroup: %v", err)
+				}
 			}
 		}
 		if err := runInCgroup(cg, func() error {
-			ioFiles, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir)
+			ioFiles, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir, args.Attached)
 			if err != nil {
 				return err
 			}
@@ -397,7 +412,7 @@ func New(conf *boot.Config, args Args) (*Container, error) {
 }
 
 // Start starts running the containerized process inside the sandbox.
-func (c *Container) Start(conf *boot.Config) error {
+func (c *Container) Start(conf *config.Config) error {
 	log.Debugf("Start container %q", c.ID)
 
 	if err := c.Saver.lock(); err != nil {
@@ -427,7 +442,7 @@ func (c *Container) Start(conf *boot.Config) error {
 		// the start (and all their children processes).
 		if err := runInCgroup(c.Sandbox.Cgroup, func() error {
 			// Create the gofer process.
-			ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir)
+			ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false)
 			if err != nil {
 				return err
 			}
@@ -472,7 +487,7 @@ func (c *Container) Start(conf *boot.Config) error {
 
 // Restore takes a container and replaces its kernel and file system
 // to restore a container from its state file.
-func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile string) error {
+func (c *Container) Restore(spec *specs.Spec, conf *config.Config, restoreFile string) error {
 	log.Debugf("Restore container %q", c.ID)
 	if err := c.Saver.lock(); err != nil {
 		return err
@@ -499,7 +514,7 @@ func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile str
 }
 
 // Run is a helper that calls Create + Start + Wait.
-func Run(conf *boot.Config, args Args) (syscall.WaitStatus, error) {
+func Run(conf *config.Config, args Args) (syscall.WaitStatus, error) {
 	log.Debugf("Run container %q in root dir: %s", args.ID, conf.RootDir)
 	c, err := New(conf, args)
 	if err != nil {
@@ -861,7 +876,7 @@ func (c *Container) waitForStopped() error {
 	return backoff.Retry(op, b)
 }
 
-func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]*os.File, *os.File, error) {
+func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bundleDir string, attached bool) ([]*os.File, *os.File, error) {
 	// Start with the general config flags.
 	args := conf.ToFlags()
 
@@ -901,9 +916,6 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund
 	}
 
 	args = append(args, "gofer", "--bundle", bundleDir)
-	if conf.Overlay {
-		args = append(args, "--panic-on-write=true")
-	}
 
 	// Open the spec file to donate to the sandbox.
 	specFile, err := specutils.OpenSpec(bundleDir)
@@ -955,6 +967,14 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund
 	cmd.ExtraFiles = goferEnds
 	cmd.Args[0] = "runsc-gofer"
 
+	if attached {
+		// The gofer is attached to the lifetime of this process, so it
+		// should synchronously die when this process dies.
+		cmd.SysProcAttr = &syscall.SysProcAttr{
+			Pdeathsig: syscall.SIGKILL,
+		}
+	}
+
 	// Enter new namespaces to isolate from the rest of the system. Don't unshare
 	// cgroup because gofer is added to a cgroup in the caller's namespace.
 	nss := []specs.LinuxNamespace{
@@ -979,7 +999,7 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund
 	// Start the gofer in the given namespace.
 	log.Debugf("Starting gofer: %s %v", binPath, args)
 	if err := specutils.StartInNS(cmd, nss); err != nil {
-		return nil, nil, fmt.Errorf("Gofer: %v", err)
+		return nil, nil, fmt.Errorf("gofer: %v", err)
 	}
 	log.Infof("Gofer started, PID: %d", cmd.Process.Pid)
 	c.GoferPid = cmd.Process.Pid
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index e7715b6f7..cc188f45b 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -20,6 +20,7 @@ import (
 	"fmt"
 	"io"
 	"io/ioutil"
+	"math"
 	"os"
 	"path"
 	"path/filepath"
@@ -40,8 +41,9 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/test/testutil"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/boot/platforms"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -53,9 +55,8 @@ func waitForProcessList(cont *Container, want []*control.Process) error {
 			err = fmt.Errorf("error getting process data from container: %v", err)
 			return &backoff.PermanentError{Err: err}
 		}
-		if r, err := procListsEqual(got, want); !r {
-			return fmt.Errorf("container got process list: %s, want: %s: error: %v",
-				procListToString(got), procListToString(want), err)
+		if !procListsEqual(got, want) {
+			return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want))
 		}
 		return nil
 	}
@@ -92,36 +93,72 @@ func blockUntilWaitable(pid int) error {
 	return err
 }
 
-// procListsEqual is used to check whether 2 Process lists are equal for all
-// implemented fields.
-func procListsEqual(got, want []*control.Process) (bool, error) {
-	if len(got) != len(want) {
-		return false, nil
-	}
-	for i := range got {
-		pd1 := got[i]
-		pd2 := want[i]
-		// Zero out timing dependant fields.
-		pd1.Time = ""
-		pd1.STime = ""
-		pd1.C = 0
-		// Ignore TTY field too, since it's not relevant in the cases
-		// where we use this method. Tests that care about the TTY
-		// field should check for it themselves.
-		pd1.TTY = ""
-		pd1Json, err := control.ProcessListToJSON([]*control.Process{pd1})
-		if err != nil {
-			return false, err
+// procListsEqual is used to check whether 2 Process lists are equal. In wants,
+// PID/PPID set to -1, UID set to math.MaxUint32, and empty TTY/Cmd are ignored.
+// Timestamp and threads fields are always ignored.
+func procListsEqual(gots, wants []*control.Process) bool {
+	if len(gots) != len(wants) {
+		return false
+	}
+	for i := range gots {
+		got := gots[i]
+		want := wants[i]
+
+		if want.UID != math.MaxUint32 && want.UID != got.UID {
+			return false
 		}
-		pd2Json, err := control.ProcessListToJSON([]*control.Process{pd2})
-		if err != nil {
-			return false, err
+		if want.PID != -1 && want.PID != got.PID {
+			return false
 		}
-		if pd1Json != pd2Json {
-			return false, nil
+		if want.PPID != -1 && want.PPID != got.PPID {
+			return false
+		}
+		if len(want.TTY) != 0 && want.TTY != got.TTY {
+			return false
+		}
+		if len(want.Cmd) != 0 && want.Cmd != got.Cmd {
+			return false
 		}
 	}
-	return true, nil
+	return true
+}
+
+type processBuilder struct {
+	process control.Process
+}
+
+func newProcessBuilder() *processBuilder {
+	return &processBuilder{
+		process: control.Process{
+			UID:  math.MaxUint32,
+			PID:  -1,
+			PPID: -1,
+		},
+	}
+}
+
+func (p *processBuilder) Cmd(cmd string) *processBuilder {
+	p.process.Cmd = cmd
+	return p
+}
+
+func (p *processBuilder) PID(pid kernel.ThreadID) *processBuilder {
+	p.process.PID = pid
+	return p
+}
+
+func (p *processBuilder) PPID(ppid kernel.ThreadID) *processBuilder {
+	p.process.PPID = ppid
+	return p
+}
+
+func (p *processBuilder) UID(uid auth.KUID) *processBuilder {
+	p.process.UID = uid
+	return p
+}
+
+func (p *processBuilder) Process() *control.Process {
+	return &p.process
 }
 
 func procListToString(pl []*control.Process) string {
@@ -214,7 +251,7 @@ func readOutputNum(file string, position int) (int, error) {
 
 // run starts the sandbox and waits for it to exit, checking that the
 // application succeeded.
-func run(spec *specs.Spec, conf *boot.Config) error {
+func run(spec *specs.Spec, conf *config.Config) error {
 	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
 		return fmt.Errorf("error setting up container: %v", err)
@@ -253,26 +290,24 @@ var (
 )
 
 // configs generates different configurations to run tests.
-func configs(t *testing.T, opts ...configOption) map[string]*boot.Config {
+func configs(t *testing.T, opts ...configOption) map[string]*config.Config {
 	// Always load the default config.
-	cs := make(map[string]*boot.Config)
+	cs := make(map[string]*config.Config)
+	testutil.TestConfig(t)
 	for _, o := range opts {
+		c := testutil.TestConfig(t)
 		switch o {
 		case overlay:
-			c := testutil.TestConfig(t)
 			c.Overlay = true
 			cs["overlay"] = c
 		case ptrace:
-			c := testutil.TestConfig(t)
 			c.Platform = platforms.Ptrace
 			cs["ptrace"] = c
 		case kvm:
-			c := testutil.TestConfig(t)
 			c.Platform = platforms.KVM
 			cs["kvm"] = c
 		case nonExclusiveFS:
-			c := testutil.TestConfig(t)
-			c.FileAccess = boot.FileAccessShared
+			c.FileAccess = config.FileAccessShared
 			cs["non-exclusive"] = c
 		default:
 			panic(fmt.Sprintf("unknown config option %v", o))
@@ -281,23 +316,14 @@ func configs(t *testing.T, opts ...configOption) map[string]*boot.Config {
 	return cs
 }
 
-func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*boot.Config {
-	vfs1 := configs(t, opts...)
-
-	var optsVFS2 []configOption
-	for _, opt := range opts {
-		// TODO(gvisor.dev/issue/1487): Enable overlay tests.
-		if opt != overlay {
-			optsVFS2 = append(optsVFS2, opt)
-		}
-	}
-
-	for key, value := range configs(t, optsVFS2...) {
+// TODO(gvisor.dev/issue/1624): Merge with configs when VFS2 is the default.
+func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*config.Config {
+	all := configs(t, opts...)
+	for key, value := range configs(t, opts...) {
 		value.VFS2 = true
-		vfs1[key+"VFS2"] = value
+		all[key+"VFS2"] = value
 	}
-
-	return vfs1
+	return all
 }
 
 // TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle.
@@ -323,14 +349,7 @@ func TestLifecycle(t *testing.T) {
 
 			// expectedPL lists the expected process state of the container.
 			expectedPL := []*control.Process{
-				{
-					UID:     0,
-					PID:     1,
-					PPID:    0,
-					C:       0,
-					Cmd:     "sleep",
-					Threads: []kernel.ThreadID{1},
-				},
+				newProcessBuilder().Cmd("sleep").Process(),
 			}
 			// Create the container.
 			args := Args{
@@ -483,7 +502,7 @@ func TestExePath(t *testing.T) {
 		t.Fatalf("error making directory: %v", err)
 	}
 
-	for name, conf := range configsWithVFS2(t, overlay) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			for _, test := range []struct {
 				path    string
@@ -608,10 +627,16 @@ func doAppExitStatus(t *testing.T, vfs2 bool) {
 
 // TestExec verifies that a container can exec a new program.
 func TestExec(t *testing.T) {
-	for name, conf := range configsWithVFS2(t, overlay) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
-			const uid = 343
-			spec := testutil.NewSpecWithArgs("sleep", "100")
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "exec-test")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			// Note that some shells may exec the final command in a sequence as
+			// an optimization. We avoid this here by adding the exit 0.
+			cmd := fmt.Sprintf("ln -s /bin/true %q/symlink && sleep 100 && exit 0", dir)
+			spec := testutil.NewSpecWithArgs("sh", "-c", cmd)
 
 			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 			if err != nil {
@@ -634,29 +659,127 @@ func TestExec(t *testing.T) {
 				t.Fatalf("error starting container: %v", err)
 			}
 
-			// expectedPL lists the expected process state of the container.
+			// Wait until sleep is running to ensure the symlink was created.
 			expectedPL := []*control.Process{
+				newProcessBuilder().Cmd("sh").Process(),
+				newProcessBuilder().Cmd("sleep").Process(),
+			}
+			if err := waitForProcessList(cont, expectedPL); err != nil {
+				t.Fatalf("waitForProcessList: %v", err)
+			}
+
+			for _, tc := range []struct {
+				name string
+				args control.ExecArgs
+			}{
 				{
-					UID:     0,
-					PID:     1,
-					PPID:    0,
-					C:       0,
-					Cmd:     "sleep",
-					Threads: []kernel.ThreadID{1},
+					name: "complete",
+					args: control.ExecArgs{
+						Filename: "/bin/true",
+						Argv:     []string{"/bin/true"},
+					},
 				},
 				{
-					UID:     uid,
-					PID:     2,
-					PPID:    0,
-					C:       0,
-					Cmd:     "sleep",
-					Threads: []kernel.ThreadID{2},
+					name: "filename",
+					args: control.ExecArgs{
+						Filename: "/bin/true",
+					},
 				},
+				{
+					name: "argv",
+					args: control.ExecArgs{
+						Argv: []string{"/bin/true"},
+					},
+				},
+				{
+					name: "filename resolution",
+					args: control.ExecArgs{
+						Filename: "true",
+						Envv:     []string{"PATH=/bin"},
+					},
+				},
+				{
+					name: "argv resolution",
+					args: control.ExecArgs{
+						Argv: []string{"true"},
+						Envv: []string{"PATH=/bin"},
+					},
+				},
+				{
+					name: "argv symlink",
+					args: control.ExecArgs{
+						Argv: []string{filepath.Join(dir, "symlink")},
+					},
+				},
+				{
+					name: "working dir",
+					args: control.ExecArgs{
+						Argv:             []string{"/bin/sh", "-c", `if [[ "${PWD}" != "/tmp" ]]; then exit 1; fi`},
+						WorkingDirectory: "/tmp",
+					},
+				},
+				{
+					name: "user",
+					args: control.ExecArgs{
+						Argv: []string{"/bin/sh", "-c", `if [[ "$(id -u)" != "343" ]]; then exit 1; fi`},
+						KUID: 343,
+					},
+				},
+				{
+					name: "group",
+					args: control.ExecArgs{
+						Argv: []string{"/bin/sh", "-c", `if [[ "$(id -g)" != "343" ]]; then exit 1; fi`},
+						KGID: 343,
+					},
+				},
+				{
+					name: "env",
+					args: control.ExecArgs{
+						Argv: []string{"/bin/sh", "-c", `if [[ "${FOO}" != "123" ]]; then exit 1; fi`},
+						Envv: []string{"FOO=123"},
+					},
+				},
+			} {
+				t.Run(tc.name, func(t *testing.T) {
+					// t.Parallel()
+					if ws, err := cont.executeSync(&tc.args); err != nil {
+						t.Fatalf("executeAsync(%+v): %v", tc.args, err)
+					} else if ws != 0 {
+						t.Fatalf("executeAsync(%+v) failed with exit: %v", tc.args, ws)
+					}
+				})
 			}
+		})
+	}
+}
 
-			// Verify that "sleep 100" is running.
-			if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
-				t.Error(err)
+// TestExecProcList verifies that a container can exec a new program and it
+// shows correctly in the process list.
+func TestExecProcList(t *testing.T) {
+	for name, conf := range configsWithVFS2(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			const uid = 343
+			spec := testutil.NewSpecWithArgs("sleep", "100")
+
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
 			}
 
 			execArgs := &control.ExecArgs{
@@ -666,9 +789,8 @@ func TestExec(t *testing.T) {
 				KUID:             uid,
 			}
 
-			// Verify that "sleep 100" and "sleep 5" are running
-			// after exec.  First, start running exec (whick
-			// blocks).
+			// Verify that "sleep 100" and "sleep 5" are running after exec. First,
+			// start running exec (which blocks).
 			ch := make(chan error)
 			go func() {
 				exitStatus, err := cont.executeSync(execArgs)
@@ -681,6 +803,11 @@ func TestExec(t *testing.T) {
 				}
 			}()
 
+			// expectedPL lists the expected process state of the container.
+			expectedPL := []*control.Process{
+				newProcessBuilder().PID(1).PPID(0).Cmd("sleep").UID(0).Process(),
+				newProcessBuilder().PID(2).PPID(0).Cmd("sleep").UID(uid).Process(),
+			}
 			if err := waitForProcessList(cont, expectedPL); err != nil {
 				t.Fatalf("error waiting for processes: %v", err)
 			}
@@ -700,7 +827,7 @@ func TestExec(t *testing.T) {
 
 // TestKillPid verifies that we can signal individual exec'd processes.
 func TestKillPid(t *testing.T) {
-	for name, conf := range configsWithVFS2(t, overlay) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			app, err := testutil.FindFile("test/cmd/test_app/test_app")
 			if err != nil {
@@ -768,13 +895,15 @@ func TestKillPid(t *testing.T) {
 	}
 }
 
-// TestCheckpointRestore creates a container that continuously writes successive integers
-// to a file. To test checkpoint and restore functionality, the container is
-// checkpointed and the last number printed to the file is recorded. Then, it is restored in two
-// new containers and the first number printed from these containers is checked. Both should
-// be the next consecutive number after the last number from the checkpointed container.
+// TestCheckpointRestore creates a container that continuously writes successive
+// integers to a file. To test checkpoint and restore functionality, the
+// container is checkpointed and the last number printed to the file is
+// recorded. Then, it is restored in two new containers and the first number
+// printed from these containers is checked. Both should be the next consecutive
+// number after the last number from the checkpointed container.
 func TestCheckpointRestore(t *testing.T) {
 	// Skip overlay because test requires writing to host file.
+	// TODO(gvisor.dev/issue/1663): Add VFS2 configs when save/restore (S/R) support is added.
 	for name, conf := range configs(t, noOverlay...) {
 		t.Run(name, func(t *testing.T) {
 			dir, err := ioutil.TempDir(testutil.TmpDir(), "checkpoint-test")
@@ -936,6 +1065,7 @@ func TestCheckpointRestore(t *testing.T) {
 // with filesystem Unix Domain Socket use.
 func TestUnixDomainSockets(t *testing.T) {
 	// Skip overlay because test requires writing to host file.
+	// TODO(gvisor.dev/issue/1663): Add VFS2 configs when save/restore (S/R) support is added.
 	for name, conf := range configs(t, noOverlay...) {
 		t.Run(name, func(t *testing.T) {
 			// UDS path is limited to 108 chars for compatibility with older systems.
@@ -1073,7 +1203,7 @@ func TestUnixDomainSockets(t *testing.T) {
 // recreated. Then it resumes the container, verify that the file gets created
 // again.
 func TestPauseResume(t *testing.T) {
-	for name, conf := range configs(t, noOverlay...) {
+	for name, conf := range configsWithVFS2(t, noOverlay...) {
 		t.Run(name, func(t *testing.T) {
 			tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "lock")
 			if err != nil {
@@ -1242,24 +1372,9 @@ func TestCapabilities(t *testing.T) {
 
 			// expectedPL lists the expected process state of the container.
 			expectedPL := []*control.Process{
-				{
-					UID:     0,
-					PID:     1,
-					PPID:    0,
-					C:       0,
-					Cmd:     "sleep",
-					Threads: []kernel.ThreadID{1},
-				},
-				{
-					UID:     uid,
-					PID:     2,
-					PPID:    0,
-					C:       0,
-					Cmd:     "exe",
-					Threads: []kernel.ThreadID{2},
-				},
+				newProcessBuilder().Cmd("sleep").Process(),
 			}
-			if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
+			if err := waitForProcessList(cont, expectedPL); err != nil {
 				t.Fatalf("Failed to wait for sleep to start, err: %v", err)
 			}
 
@@ -1348,7 +1463,7 @@ func TestRunNonRoot(t *testing.T) {
 // TestMountNewDir checks that runsc will create destination directory if it
 // doesn't exit.
 func TestMountNewDir(t *testing.T) {
-	for name, conf := range configsWithVFS2(t, overlay) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			root, err := ioutil.TempDir(testutil.TmpDir(), "root")
 			if err != nil {
@@ -1368,6 +1483,8 @@ func TestMountNewDir(t *testing.T) {
 				Source:      srcDir,
 				Type:        "bind",
 			})
+			// Extra points for creating the mount with a readonly root.
+			spec.Root.Readonly = true
 
 			if err := run(spec, conf); err != nil {
 				t.Fatalf("error running sandbox: %v", err)
@@ -1377,17 +1494,17 @@ func TestMountNewDir(t *testing.T) {
 }
 
 func TestReadonlyRoot(t *testing.T) {
-	for name, conf := range configsWithVFS2(t, overlay) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
-			spec := testutil.NewSpecWithArgs("/bin/touch", "/foo")
+			spec := testutil.NewSpecWithArgs("sleep", "100")
 			spec.Root.Readonly = true
+
 			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 			if err != nil {
 				t.Fatalf("error setting up container: %v", err)
 			}
 			defer cleanup()
 
-			// Create, start and wait for the container.
 			args := Args{
 				ID:        testutil.RandomContainerID(),
 				Spec:      spec,
@@ -1402,12 +1519,82 @@ func TestReadonlyRoot(t *testing.T) {
 				t.Fatalf("error starting container: %v", err)
 			}
 
-			ws, err := c.Wait()
+			// Read mounts to check that root is readonly.
+			out, ws, err := executeCombinedOutput(c, "/bin/sh", "-c", "mount | grep ' / '")
+			if err != nil || ws != 0 {
+				t.Fatalf("exec failed, ws: %v, err: %v", ws, err)
+			}
+			t.Logf("root mount: %q", out)
+			if !strings.Contains(string(out), "(ro)") {
+				t.Errorf("root not mounted readonly: %q", out)
+			}
+
+			// Check that file cannot be created.
+			ws, err = execute(c, "/bin/touch", "/foo")
 			if err != nil {
-				t.Fatalf("error waiting on container: %v", err)
+				t.Fatalf("touch file in ro mount: %v", err)
 			}
 			if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
-				t.Fatalf("container failed, waitStatus: %v", ws)
+				t.Fatalf("wrong waitStatus: %v", ws)
+			}
+		})
+	}
+}
+
+func TestReadonlyMount(t *testing.T) {
+	for name, conf := range configsWithVFS2(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			spec := testutil.NewSpecWithArgs("sleep", "100")
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Destination: dir,
+				Source:      dir,
+				Type:        "bind",
+				Options:     []string{"ro"},
+			})
+			spec.Root.Readonly = false
+
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			// Read mounts to check that volume is readonly.
+			cmd := fmt.Sprintf("mount | grep ' %s '", dir)
+			out, ws, err := executeCombinedOutput(c, "/bin/sh", "-c", cmd)
+			if err != nil || ws != 0 {
+				t.Fatalf("exec failed, ws: %v, err: %v", ws, err)
+			}
+			t.Logf("mount: %q", out)
+			if !strings.Contains(string(out), "(ro)") {
+				t.Errorf("volume not mounted readonly: %q", out)
+			}
+
+			// Check that file cannot be created.
+			ws, err = execute(c, "/bin/touch", path.Join(dir, "file"))
+			if err != nil {
+				t.Fatalf("touch file in ro mount: %v", err)
+			}
+			if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
+				t.Fatalf("wrong WaitStatus: %v", ws)
 			}
 		})
 	}
@@ -1494,54 +1681,6 @@ func TestUIDMap(t *testing.T) {
 	}
 }
 
-func TestReadonlyMount(t *testing.T) {
-	for name, conf := range configsWithVFS2(t, overlay) {
-		t.Run(name, func(t *testing.T) {
-			dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount")
-			spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
-			if err != nil {
-				t.Fatalf("ioutil.TempDir() failed: %v", err)
-			}
-			spec.Mounts = append(spec.Mounts, specs.Mount{
-				Destination: dir,
-				Source:      dir,
-				Type:        "bind",
-				Options:     []string{"ro"},
-			})
-			spec.Root.Readonly = false
-
-			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
-			if err != nil {
-				t.Fatalf("error setting up container: %v", err)
-			}
-			defer cleanup()
-
-			// Create, start and wait for the container.
-			args := Args{
-				ID:        testutil.RandomContainerID(),
-				Spec:      spec,
-				BundleDir: bundleDir,
-			}
-			c, err := New(conf, args)
-			if err != nil {
-				t.Fatalf("error creating container: %v", err)
-			}
-			defer c.Destroy()
-			if err := c.Start(conf); err != nil {
-				t.Fatalf("error starting container: %v", err)
-			}
-
-			ws, err := c.Wait()
-			if err != nil {
-				t.Fatalf("error waiting on container: %v", err)
-			}
-			if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
-				t.Fatalf("container failed, waitStatus: %v", ws)
-			}
-		})
-	}
-}
-
 // TestAbbreviatedIDs checks that runsc supports using abbreviated container
 // IDs in place of full IDs.
 func TestAbbreviatedIDs(t *testing.T) {
@@ -1708,8 +1847,9 @@ func TestUserLog(t *testing.T) {
 		t.Fatal("error finding test_app:", err)
 	}
 
-	// sched_rr_get_interval = 148 - not implemented in gvisor.
-	spec := testutil.NewSpecWithArgs(app, "syscall", "--syscall=148")
+	// sched_rr_get_interval - not implemented in gvisor.
+	num := strconv.Itoa(syscall.SYS_SCHED_RR_GET_INTERVAL)
+	spec := testutil.NewSpecWithArgs(app, "syscall", "--syscall="+num)
 	conf := testutil.TestConfig(t)
 	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 	if err != nil {
@@ -1891,7 +2031,7 @@ func doDestroyStartingTest(t *testing.T, vfs2 bool) {
 }
 
 func TestCreateWorkingDir(t *testing.T) {
-	for name, conf := range configsWithVFS2(t, overlay) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "cwd-create")
 			if err != nil {
@@ -1994,27 +2134,19 @@ func TestMountPropagation(t *testing.T) {
 
 	// Check that mount didn't propagate to private mount.
 	privFile := filepath.Join(priv, "mnt", "file")
-	execArgs := &control.ExecArgs{
-		Filename: "/usr/bin/test",
-		Argv:     []string{"test", "!", "-f", privFile},
-	}
-	if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
+	if ws, err := execute(cont, "/usr/bin/test", "!", "-f", privFile); err != nil || ws != 0 {
 		t.Fatalf("exec: test ! -f %q, ws: %v, err: %v", privFile, ws, err)
 	}
 
 	// Check that mount propagated to slave mount.
 	slaveFile := filepath.Join(slave, "mnt", "file")
-	execArgs = &control.ExecArgs{
-		Filename: "/usr/bin/test",
-		Argv:     []string{"test", "-f", slaveFile},
-	}
-	if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
+	if ws, err := execute(cont, "/usr/bin/test", "-f", slaveFile); err != nil || ws != 0 {
 		t.Fatalf("exec: test -f %q, ws: %v, err: %v", privFile, ws, err)
 	}
 }
 
 func TestMountSymlink(t *testing.T) {
-	for name, conf := range configsWithVFS2(t, overlay) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			dir, err := ioutil.TempDir(testutil.TmpDir(), "mount-symlink")
 			if err != nil {
@@ -2074,11 +2206,7 @@ func TestMountSymlink(t *testing.T) {
 			// Check that symlink was resolved and mount was created where the symlink
 			// is pointing to.
 			file := path.Join(target, "file")
-			execArgs := &control.ExecArgs{
-				Filename: "/usr/bin/test",
-				Argv:     []string{"test", "-f", file},
-			}
-			if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
+			if ws, err := execute(cont, "/usr/bin/test", "-f", file); err != nil || ws != 0 {
 				t.Fatalf("exec: test -f %q, ws: %v, err: %v", file, ws, err)
 			}
 		})
@@ -2204,13 +2332,42 @@ func TestTTYField(t *testing.T) {
 	}
 }
 
+func execute(cont *Container, name string, arg ...string) (syscall.WaitStatus, error) {
+	args := &control.ExecArgs{
+		Filename: name,
+		Argv:     append([]string{name}, arg...),
+	}
+	return cont.executeSync(args)
+}
+
+func executeCombinedOutput(cont *Container, name string, arg ...string) ([]byte, syscall.WaitStatus, error) {
+	r, w, err := os.Pipe()
+	if err != nil {
+		return nil, 0, err
+	}
+	defer r.Close()
+
+	args := &control.ExecArgs{
+		Filename:    name,
+		Argv:        append([]string{name}, arg...),
+		FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, w, w}},
+	}
+	ws, err := cont.executeSync(args)
+	w.Close()
+	if err != nil {
+		return nil, 0, err
+	}
+	out, err := ioutil.ReadAll(r)
+	return out, ws, err
+}
+
 // executeSync synchronously executes a new process.
-func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) {
-	pid, err := cont.Execute(args)
+func (c *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) {
+	pid, err := c.Execute(args)
 	if err != nil {
 		return 0, fmt.Errorf("error executing: %v", err)
 	}
-	ws, err := cont.WaitPID(pid)
+	ws, err := c.WaitPID(pid)
 	if err != nil {
 		return 0, fmt.Errorf("error waiting: %v", err)
 	}
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 207206dd2..850e80290 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -33,6 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -60,7 +61,7 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
 	return specs, ids
 }
 
-func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) {
+func startContainers(conf *config.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) {
 	if len(conf.RootDir) == 0 {
 		panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.")
 	}
@@ -100,19 +101,20 @@ type execDesc struct {
 	c    *Container
 	cmd  []string
 	want int
-	desc string
+	name string
 }
 
-func execMany(execs []execDesc) error {
+func execMany(t *testing.T, execs []execDesc) {
 	for _, exec := range execs {
-		args := &control.ExecArgs{Argv: exec.cmd}
-		if ws, err := exec.c.executeSync(args); err != nil {
-			return fmt.Errorf("error executing %+v: %v", args, err)
-		} else if ws.ExitStatus() != exec.want {
-			return fmt.Errorf("%q: exec %q got exit status: %d, want: %d", exec.desc, exec.cmd, ws.ExitStatus(), exec.want)
-		}
+		t.Run(exec.name, func(t *testing.T) {
+			args := &control.ExecArgs{Argv: exec.cmd}
+			if ws, err := exec.c.executeSync(args); err != nil {
+				t.Errorf("error executing %+v: %v", args, err)
+			} else if ws.ExitStatus() != exec.want {
+				t.Errorf("%q: exec %q got exit status: %d, want: %d", exec.name, exec.cmd, ws.ExitStatus(), exec.want)
+			}
+		})
 	}
-	return nil
 }
 
 func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
@@ -149,13 +151,13 @@ func TestMultiContainerSanity(t *testing.T) {
 
 			// Check via ps that multiple processes are running.
 			expectedPL := []*control.Process{
-				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+				newProcessBuilder().PID(1).PPID(0).Cmd("sleep").Process(),
 			}
 			if err := waitForProcessList(containers[0], expectedPL); err != nil {
 				t.Errorf("failed to wait for sleep to start: %v", err)
 			}
 			expectedPL = []*control.Process{
-				{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+				newProcessBuilder().PID(2).PPID(0).Cmd("sleep").Process(),
 			}
 			if err := waitForProcessList(containers[1], expectedPL); err != nil {
 				t.Errorf("failed to wait for sleep to start: %v", err)
@@ -167,7 +169,7 @@ func TestMultiContainerSanity(t *testing.T) {
 // TestMultiPIDNS checks that it is possible to run 2 dead-simple
 // containers in the same sandbox with different pidns.
 func TestMultiPIDNS(t *testing.T) {
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
@@ -195,13 +197,13 @@ func TestMultiPIDNS(t *testing.T) {
 
 			// Check via ps that multiple processes are running.
 			expectedPL := []*control.Process{
-				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+				newProcessBuilder().PID(1).Cmd("sleep").Process(),
 			}
 			if err := waitForProcessList(containers[0], expectedPL); err != nil {
 				t.Errorf("failed to wait for sleep to start: %v", err)
 			}
 			expectedPL = []*control.Process{
-				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+				newProcessBuilder().PID(1).Cmd("sleep").Process(),
 			}
 			if err := waitForProcessList(containers[1], expectedPL); err != nil {
 				t.Errorf("failed to wait for sleep to start: %v", err)
@@ -212,7 +214,7 @@ func TestMultiPIDNS(t *testing.T) {
 
 // TestMultiPIDNSPath checks the pidns path.
 func TestMultiPIDNSPath(t *testing.T) {
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
@@ -257,7 +259,7 @@ func TestMultiPIDNSPath(t *testing.T) {
 
 			// Check via ps that multiple processes are running.
 			expectedPL := []*control.Process{
-				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+				newProcessBuilder().PID(1).PPID(0).Cmd("sleep").Process(),
 			}
 			if err := waitForProcessList(containers[0], expectedPL); err != nil {
 				t.Errorf("failed to wait for sleep to start: %v", err)
@@ -267,7 +269,7 @@ func TestMultiPIDNSPath(t *testing.T) {
 			}
 
 			expectedPL = []*control.Process{
-				{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+				newProcessBuilder().PID(2).PPID(0).Cmd("sleep").Process(),
 			}
 			if err := waitForProcessList(containers[1], expectedPL); err != nil {
 				t.Errorf("failed to wait for sleep to start: %v", err)
@@ -300,7 +302,7 @@ func TestMultiContainerWait(t *testing.T) {
 
 	// Check via ps that multiple processes are running.
 	expectedPL := []*control.Process{
-		{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+		newProcessBuilder().PID(2).PPID(0).Cmd("sleep").Process(),
 	}
 	if err := waitForProcessList(containers[1], expectedPL); err != nil {
 		t.Errorf("failed to wait for sleep to start: %v", err)
@@ -345,7 +347,7 @@ func TestMultiContainerWait(t *testing.T) {
 	// After Wait returns, ensure that the root container is running and
 	// the child has finished.
 	expectedPL = []*control.Process{
-		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+		newProcessBuilder().Cmd("sleep").Process(),
 	}
 	if err := waitForProcessList(containers[0], expectedPL); err != nil {
 		t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err)
@@ -377,7 +379,7 @@ func TestExecWait(t *testing.T) {
 
 	// Check via ps that process is running.
 	expectedPL := []*control.Process{
-		{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+		newProcessBuilder().Cmd("sleep").Process(),
 	}
 	if err := waitForProcessList(containers[1], expectedPL); err != nil {
 		t.Fatalf("failed to wait for sleep to start: %v", err)
@@ -412,7 +414,7 @@ func TestExecWait(t *testing.T) {
 
 	// Wait for the exec'd process to exit.
 	expectedPL = []*control.Process{
-		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+		newProcessBuilder().PID(1).Cmd("sleep").Process(),
 	}
 	if err := waitForProcessList(containers[0], expectedPL); err != nil {
 		t.Fatalf("failed to wait for second container to stop: %v", err)
@@ -478,7 +480,7 @@ func TestMultiContainerMount(t *testing.T) {
 // TestMultiContainerSignal checks that it is possible to signal individual
 // containers without killing the entire sandbox.
 func TestMultiContainerSignal(t *testing.T) {
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
@@ -498,9 +500,8 @@ func TestMultiContainerSignal(t *testing.T) {
 
 			// Check via ps that container 1 process is running.
 			expectedPL := []*control.Process{
-				{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+				newProcessBuilder().Cmd("sleep").Process(),
 			}
-
 			if err := waitForProcessList(containers[1], expectedPL); err != nil {
 				t.Errorf("failed to wait for sleep to start: %v", err)
 			}
@@ -512,7 +513,7 @@ func TestMultiContainerSignal(t *testing.T) {
 
 			// Make sure process 1 is still running.
 			expectedPL = []*control.Process{
-				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+				newProcessBuilder().PID(1).Cmd("sleep").Process(),
 			}
 			if err := waitForProcessList(containers[0], expectedPL); err != nil {
 				t.Errorf("failed to wait for sleep to start: %v", err)
@@ -579,7 +580,7 @@ func TestMultiContainerDestroy(t *testing.T) {
 		t.Fatal("error finding test_app:", err)
 	}
 
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
@@ -626,8 +627,10 @@ func TestMultiContainerDestroy(t *testing.T) {
 			if err != nil {
 				t.Fatalf("error getting process data from sandbox: %v", err)
 			}
-			expectedPL := []*control.Process{{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}}
-			if r, err := procListsEqual(pss, expectedPL); !r {
+			expectedPL := []*control.Process{
+				newProcessBuilder().PID(1).Cmd("sleep").Process(),
+			}
+			if !procListsEqual(pss, expectedPL) {
 				t.Errorf("container got process list: %s, want: %s: error: %v",
 					procListToString(pss), procListToString(expectedPL), err)
 			}
@@ -664,7 +667,7 @@ func TestMultiContainerProcesses(t *testing.T) {
 
 	// Check root's container process list doesn't include other containers.
 	expectedPL0 := []*control.Process{
-		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+		newProcessBuilder().PID(1).Cmd("sleep").Process(),
 	}
 	if err := waitForProcessList(containers[0], expectedPL0); err != nil {
 		t.Errorf("failed to wait for process to start: %v", err)
@@ -672,8 +675,8 @@ func TestMultiContainerProcesses(t *testing.T) {
 
 	// Same for the other container.
 	expectedPL1 := []*control.Process{
-		{PID: 2, Cmd: "sh", Threads: []kernel.ThreadID{2}},
-		{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}},
+		newProcessBuilder().PID(2).Cmd("sh").Process(),
+		newProcessBuilder().PID(3).PPID(2).Cmd("sleep").Process(),
 	}
 	if err := waitForProcessList(containers[1], expectedPL1); err != nil {
 		t.Errorf("failed to wait for process to start: %v", err)
@@ -687,7 +690,7 @@ func TestMultiContainerProcesses(t *testing.T) {
 	if _, err := containers[1].Execute(args); err != nil {
 		t.Fatalf("error exec'ing: %v", err)
 	}
-	expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep", Threads: []kernel.ThreadID{4}})
+	expectedPL1 = append(expectedPL1, newProcessBuilder().PID(4).Cmd("sleep").Process())
 	if err := waitForProcessList(containers[1], expectedPL1); err != nil {
 		t.Errorf("failed to wait for process to start: %v", err)
 	}
@@ -1071,7 +1074,7 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) {
 // Test that pod shared mounts are properly mounted in 2 containers and that
 // changes from one container is reflected in the other.
 func TestMultiContainerSharedMount(t *testing.T) {
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
@@ -1109,84 +1112,82 @@ func TestMultiContainerSharedMount(t *testing.T) {
 				{
 					c:    containers[0],
 					cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
-					desc: "directory is mounted in container0",
+					name: "directory is mounted in container0",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
-					desc: "directory is mounted in container1",
+					name: "directory is mounted in container1",
 				},
 				{
 					c:    containers[0],
-					cmd:  []string{"/usr/bin/touch", file0},
-					desc: "create file in container0",
+					cmd:  []string{"/bin/touch", file0},
+					name: "create file in container0",
 				},
 				{
 					c:    containers[0],
 					cmd:  []string{"/usr/bin/test", "-f", file0},
-					desc: "file appears in container0",
+					name: "file appears in container0",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/usr/bin/test", "-f", file1},
-					desc: "file appears in container1",
+					name: "file appears in container1",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/bin/rm", file1},
-					desc: "file removed from container1",
+					name: "remove file from container1",
 				},
 				{
 					c:    containers[0],
 					cmd:  []string{"/usr/bin/test", "!", "-f", file0},
-					desc: "file removed from container0",
+					name: "file removed from container0",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/usr/bin/test", "!", "-f", file1},
-					desc: "file removed from container1",
+					name: "file removed from container1",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/bin/mkdir", file1},
-					desc: "create directory in container1",
+					name: "create directory in container1",
 				},
 				{
 					c:    containers[0],
 					cmd:  []string{"/usr/bin/test", "-d", file0},
-					desc: "dir appears in container0",
+					name: "dir appears in container0",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/usr/bin/test", "-d", file1},
-					desc: "dir appears in container1",
+					name: "dir appears in container1",
 				},
 				{
 					c:    containers[0],
 					cmd:  []string{"/bin/rmdir", file0},
-					desc: "create directory in container0",
+					name: "remove directory from container0",
 				},
 				{
 					c:    containers[0],
 					cmd:  []string{"/usr/bin/test", "!", "-d", file0},
-					desc: "dir removed from container0",
+					name: "dir removed from container0",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/usr/bin/test", "!", "-d", file1},
-					desc: "dir removed from container1",
+					name: "dir removed from container1",
 				},
 			}
-			if err := execMany(execs); err != nil {
-				t.Fatal(err.Error())
-			}
+			execMany(t, execs)
 		})
 	}
 }
 
 // Test that pod mounts are mounted as readonly when requested.
 func TestMultiContainerSharedMountReadonly(t *testing.T) {
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
@@ -1224,36 +1225,34 @@ func TestMultiContainerSharedMountReadonly(t *testing.T) {
 				{
 					c:    containers[0],
 					cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
-					desc: "directory is mounted in container0",
+					name: "directory is mounted in container0",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
-					desc: "directory is mounted in container1",
+					name: "directory is mounted in container1",
 				},
 				{
 					c:    containers[0],
-					cmd:  []string{"/usr/bin/touch", file0},
+					cmd:  []string{"/bin/touch", file0},
 					want: 1,
-					desc: "fails to write to container0",
+					name: "fails to write to container0",
 				},
 				{
 					c:    containers[1],
-					cmd:  []string{"/usr/bin/touch", file1},
+					cmd:  []string{"/bin/touch", file1},
 					want: 1,
-					desc: "fails to write to container1",
+					name: "fails to write to container1",
 				},
 			}
-			if err := execMany(execs); err != nil {
-				t.Fatal(err.Error())
-			}
+			execMany(t, execs)
 		})
 	}
 }
 
 // Test that shared pod mounts continue to work after container is restarted.
 func TestMultiContainerSharedMountRestart(t *testing.T) {
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
@@ -1290,23 +1289,21 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
 			execs := []execDesc{
 				{
 					c:    containers[0],
-					cmd:  []string{"/usr/bin/touch", file0},
-					desc: "create file in container0",
+					cmd:  []string{"/bin/touch", file0},
+					name: "create file in container0",
 				},
 				{
 					c:    containers[0],
 					cmd:  []string{"/usr/bin/test", "-f", file0},
-					desc: "file appears in container0",
+					name: "file appears in container0",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/usr/bin/test", "-f", file1},
-					desc: "file appears in container1",
+					name: "file appears in container1",
 				},
 			}
-			if err := execMany(execs); err != nil {
-				t.Fatal(err.Error())
-			}
+			execMany(t, execs)
 
 			containers[1].Destroy()
 
@@ -1333,86 +1330,84 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
 				{
 					c:    containers[0],
 					cmd:  []string{"/usr/bin/test", "-f", file0},
-					desc: "file is still in container0",
+					name: "file is still in container0",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/usr/bin/test", "-f", file1},
-					desc: "file is still in container1",
+					name: "file is still in container1",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/bin/rm", file1},
-					desc: "file removed from container1",
+					name: "file removed from container1",
 				},
 				{
 					c:    containers[0],
 					cmd:  []string{"/usr/bin/test", "!", "-f", file0},
-					desc: "file removed from container0",
+					name: "file removed from container0",
 				},
 				{
 					c:    containers[1],
 					cmd:  []string{"/usr/bin/test", "!", "-f", file1},
-					desc: "file removed from container1",
+					name: "file removed from container1",
 				},
 			}
-			if err := execMany(execs); err != nil {
-				t.Fatal(err.Error())
-			}
+			execMany(t, execs)
 		})
 	}
 }
 
 // Test that unsupported pod mounts options are ignored when matching master and
-// slave mounts.
+// replica mounts.
 func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
-	rootDir, cleanup, err := testutil.SetupRootDir()
-	if err != nil {
-		t.Fatalf("error creating root dir: %v", err)
-	}
-	defer cleanup()
-
-	conf := testutil.TestConfig(t)
-	conf.RootDir = rootDir
+	for name, conf := range configsWithVFS2(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
 
-	// Setup the containers.
-	sleep := []string{"/bin/sleep", "100"}
-	podSpec, ids := createSpecs(sleep, sleep)
-	mnt0 := specs.Mount{
-		Destination: "/mydir/test",
-		Source:      "/some/dir",
-		Type:        "tmpfs",
-		Options:     []string{"rw", "rbind", "relatime"},
-	}
-	podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+			// Setup the containers.
+			sleep := []string{"/bin/sleep", "100"}
+			podSpec, ids := createSpecs(sleep, sleep)
+			mnt0 := specs.Mount{
+				Destination: "/mydir/test",
+				Source:      "/some/dir",
+				Type:        "tmpfs",
+				Options:     []string{"rw", "rbind", "relatime"},
+			}
+			podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
 
-	mnt1 := mnt0
-	mnt1.Destination = "/mydir2/test2"
-	mnt1.Options = []string{"rw", "nosuid"}
-	podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+			mnt1 := mnt0
+			mnt1.Destination = "/mydir2/test2"
+			mnt1.Options = []string{"rw", "nosuid"}
+			podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
 
-	createSharedMount(mnt0, "test-mount", podSpec...)
+			createSharedMount(mnt0, "test-mount", podSpec...)
 
-	containers, cleanup, err := startContainers(conf, podSpec, ids)
-	if err != nil {
-		t.Fatalf("error starting containers: %v", err)
-	}
-	defer cleanup()
+			containers, cleanup, err := startContainers(conf, podSpec, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
 
-	execs := []execDesc{
-		{
-			c:    containers[0],
-			cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
-			desc: "directory is mounted in container0",
-		},
-		{
-			c:    containers[1],
-			cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
-			desc: "directory is mounted in container1",
-		},
-	}
-	if err := execMany(execs); err != nil {
-		t.Fatal(err.Error())
+			execs := []execDesc{
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
+					name: "directory is mounted in container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
+					name: "directory is mounted in container1",
+				},
+			}
+			execMany(t, execs)
+		})
 	}
 }
 
@@ -1505,7 +1500,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 	// Ensure container is running
 	c := containers[2]
 	expectedPL := []*control.Process{
-		{PID: 3, Cmd: "sleep", Threads: []kernel.ThreadID{3}},
+		newProcessBuilder().PID(3).Cmd("sleep").Process(),
 	}
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Errorf("failed to wait for sleep to start: %v", err)
@@ -1522,8 +1517,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 	}
 
 	// Check that container isn't running anymore.
-	args := &control.ExecArgs{Argv: []string{"/bin/true"}}
-	if _, err := c.executeSync(args); err == nil {
+	if _, err := execute(c, "/bin/true"); err == nil {
 		t.Fatalf("Container %q was not stopped after gofer death", c.ID)
 	}
 
@@ -1533,13 +1527,12 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 			continue // container[2] has been killed.
 		}
 		pl := []*control.Process{
-			{PID: kernel.ThreadID(i + 1), Cmd: "sleep", Threads: []kernel.ThreadID{kernel.ThreadID(i + 1)}},
+			newProcessBuilder().PID(kernel.ThreadID(i + 1)).Cmd("sleep").Process(),
 		}
 		if err := waitForProcessList(c, pl); err != nil {
 			t.Errorf("Container %q was affected by another container: %v", c.ID, err)
 		}
-		args := &control.ExecArgs{Argv: []string{"/bin/true"}}
-		if _, err := c.executeSync(args); err != nil {
+		if _, err := execute(c, "/bin/true"); err != nil {
 			t.Fatalf("Container %q was affected by another container: %v", c.ID, err)
 		}
 	}
@@ -1553,7 +1546,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 	// Wait until sandbox stops. waitForProcessList will loop until sandbox exits
 	// and RPC errors out.
 	impossiblePL := []*control.Process{
-		{PID: 100, Cmd: "non-existent-process", Threads: []kernel.ThreadID{100}},
+		newProcessBuilder().Cmd("non-existent-process").Process(),
 	}
 	if err := waitForProcessList(c, impossiblePL); err == nil {
 		t.Fatalf("Sandbox was not killed after gofer death")
@@ -1561,8 +1554,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 
 	// Check that entire sandbox isn't running anymore.
 	for _, c := range containers {
-		args := &control.ExecArgs{Argv: []string{"/bin/true"}}
-		if _, err := c.executeSync(args); err == nil {
+		if _, err := execute(c, "/bin/true"); err == nil {
 			t.Fatalf("Container %q was not stopped after gofer death", c.ID)
 		}
 	}
@@ -1697,3 +1689,80 @@ func TestMultiContainerRunNonRoot(t *testing.T) {
 		t.Fatalf("child container failed, waitStatus: %v", ws)
 	}
 }
+
+// TestMultiContainerHomeEnvDir tests that the HOME environment variable is set
+// for root containers, sub-containers, and exec'ed processes.
+func TestMultiContainerHomeEnvDir(t *testing.T) {
+	// NOTE: Don't use overlay since we need changes to persist to the temp dir
+	// outside the sandbox.
+	for testName, conf := range configsWithVFS2(t, noOverlay...) {
+		t.Run(testName, func(t *testing.T) {
+
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// Create temp files we can write the value of $HOME to.
+			homeDirs := map[string]*os.File{}
+			for _, name := range []string{"root", "sub", "exec"} {
+				homeFile, err := ioutil.TempFile(testutil.TmpDir(), name)
+				if err != nil {
+					t.Fatalf("creating temp file: %v", err)
+				}
+				homeDirs[name] = homeFile
+			}
+
+			// We will sleep in the root container in order to ensure that the root
+			// container doesn't terminate before sub containers can be created.
+			rootCmd := []string{"/bin/sh", "-c", fmt.Sprintf(`printf "$HOME" > %s; sleep 1000`, homeDirs["root"].Name())}
+			subCmd := []string{"/bin/sh", "-c", fmt.Sprintf(`printf "$HOME" > %s`, homeDirs["sub"].Name())}
+			execCmd := fmt.Sprintf(`printf "$HOME" > %s`, homeDirs["exec"].Name())
+
+			// Setup the containers, a root container and sub container.
+			specConfig, ids := createSpecs(rootCmd, subCmd)
+			containers, cleanup, err := startContainers(conf, specConfig, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
+
+			// Exec into the root container synchronously.
+			if _, err := execute(containers[0], "/bin/sh", "-c", execCmd); err != nil {
+				t.Errorf("error executing %+v: %v", execCmd, err)
+			}
+
+			// Wait for the subcontainer to finish.
+			_, err = containers[1].Wait()
+			if err != nil {
+				t.Errorf("wait on child container: %v", err)
+			}
+
+			// Wait for the root container to run.
+			expectedPL := []*control.Process{
+				newProcessBuilder().Cmd("sh").Process(),
+				newProcessBuilder().Cmd("sleep").Process(),
+			}
+			if err := waitForProcessList(containers[0], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
+
+			// Check the written files.
+			for name, tmpFile := range homeDirs {
+				dirBytes, err := ioutil.ReadAll(tmpFile)
+				if err != nil {
+					t.Fatalf("reading %s temp file: %v", name, err)
+				}
+				got := string(dirBytes)
+
+				want := "/"
+				if got != want {
+					t.Errorf("%s $HOME incorrect: got: %q, want: %q", name, got, want)
+				}
+			}
+
+		})
+	}
+}
diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go
index bac177a88..cb5bffb89 100644
--- a/runsc/container/shared_volume_test.go
+++ b/runsc/container/shared_volume_test.go
@@ -25,14 +25,14 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/test/testutil"
-	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 // TestSharedVolume checks that modifications to a volume mount are propagated
 // into and out of the sandbox.
 func TestSharedVolume(t *testing.T) {
 	conf := testutil.TestConfig(t)
-	conf.FileAccess = boot.FileAccessShared
+	conf.FileAccess = config.FileAccessShared
 
 	// Main process just sleeps. We will use "exec" to probe the state of
 	// the filesystem.
@@ -168,11 +168,7 @@ func TestSharedVolume(t *testing.T) {
 
 func checkFile(c *Container, filename string, want []byte) error {
 	cpy := filename + ".copy"
-	argsCp := &control.ExecArgs{
-		Filename: "/bin/cp",
-		Argv:     []string{"cp", "-f", filename, cpy},
-	}
-	if _, err := c.executeSync(argsCp); err != nil {
+	if _, err := execute(c, "/bin/cp", "-f", filename, cpy); err != nil {
 		return fmt.Errorf("unexpected error copying file %q to %q: %v", filename, cpy, err)
 	}
 	got, err := ioutil.ReadFile(cpy)
@@ -189,7 +185,7 @@ func checkFile(c *Container, filename string, want []byte) error {
 // is reflected inside.
 func TestSharedVolumeFile(t *testing.T) {
 	conf := testutil.TestConfig(t)
-	conf.FileAccess = boot.FileAccessShared
+	conf.FileAccess = config.FileAccessShared
 
 	// Main process just sleeps. We will use "exec" to probe the state of
 	// the filesystem.
@@ -235,11 +231,7 @@ func TestSharedVolumeFile(t *testing.T) {
 	}
 
 	// Append to file inside the container and check that content is not lost.
-	argsAppend := &control.ExecArgs{
-		Filename: "/bin/bash",
-		Argv:     []string{"bash", "-c", "echo -n sandbox- >> " + filename},
-	}
-	if _, err := c.executeSync(argsAppend); err != nil {
+	if _, err := execute(c, "/bin/bash", "-c", "echo -n sandbox- >> "+filename); err != nil {
 		t.Fatalf("unexpected error appending file %q: %v", filename, err)
 	}
 	want = []byte("host-sandbox-")
diff --git a/runsc/debian/description b/runsc/debian/description
deleted file mode 100644
index 9e8e08805..000000000
--- a/runsc/debian/description
+++ /dev/null
@@ -1 +0,0 @@
-gVisor container sandbox runtime
diff --git a/runsc/debian/postinst.sh b/runsc/debian/postinst.sh
deleted file mode 100755
index dc7aeee87..000000000
--- a/runsc/debian/postinst.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/sh -e
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-if [ "$1" != configure ]; then
-  exit 0
-fi
-
-if [ -f /etc/docker/daemon.json ]; then
-  runsc install
-  systemctl restart docker || echo "unable to restart docker; you must do so manually." >&2
-fi
diff --git a/runsc/flag/flag.go b/runsc/flag/flag.go
index 0ca4829d7..775325c06 100644
--- a/runsc/flag/flag.go
+++ b/runsc/flag/flag.go
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// Package flag wraps flag primitives.
 package flag
 
 import (
@@ -21,13 +22,19 @@ import (
 type FlagSet = flag.FlagSet
 
 var (
-	NewFlagSet  = flag.NewFlagSet
-	String      = flag.String
 	Bool        = flag.Bool
-	Int         = flag.Int
-	Uint        = flag.Uint
 	CommandLine = flag.CommandLine
+	Int         = flag.Int
+	NewFlagSet  = flag.NewFlagSet
 	Parse       = flag.Parse
+	String      = flag.String
+	Uint        = flag.Uint
+	Var         = flag.Var
 )
 
 const ContinueOnError = flag.ContinueOnError
+
+// Get returns the flag's underlying object.
+func Get(v flag.Value) interface{} {
+	return v.(flag.Getter).Get()
+}
diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD
index 1036b0630..96c57a426 100644
--- a/runsc/fsgofer/BUILD
+++ b/runsc/fsgofer/BUILD
@@ -31,5 +31,7 @@ go_test(
     deps = [
         "//pkg/log",
         "//pkg/p9",
+        "//pkg/test/testutil",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go
index 1dce36965..39b8a0b1e 100644
--- a/runsc/fsgofer/filter/config.go
+++ b/runsc/fsgofer/filter/config.go
@@ -27,62 +27,51 @@ import (
 var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_ACCEPT:        {},
 	syscall.SYS_CLOCK_GETTIME: {},
-	syscall.SYS_CLONE: []seccomp.Rule{
-		{
-			seccomp.AllowValue(
-				syscall.CLONE_VM |
-					syscall.CLONE_FS |
-					syscall.CLONE_FILES |
-					syscall.CLONE_SIGHAND |
-					syscall.CLONE_SYSVSEM |
-					syscall.CLONE_THREAD),
-		},
-	},
-	syscall.SYS_CLOSE:     {},
-	syscall.SYS_DUP:       {},
-	syscall.SYS_EPOLL_CTL: {},
+	syscall.SYS_CLOSE:         {},
+	syscall.SYS_DUP:           {},
+	syscall.SYS_EPOLL_CTL:     {},
 	syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_EVENTFD2: []seccomp.Rule{
 		{
-			seccomp.AllowValue(0),
-			seccomp.AllowValue(0),
+			seccomp.EqualTo(0),
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_EXIT:       {},
 	syscall.SYS_EXIT_GROUP: {},
 	syscall.SYS_FALLOCATE: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_FCHMOD:   {},
 	syscall.SYS_FCHOWNAT: {},
 	syscall.SYS_FCNTL: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.F_GETFL),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.F_GETFL),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.F_SETFL),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.F_SETFL),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.F_GETFD),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.F_GETFD),
 		},
 		// Used by flipcall.PacketWindowAllocator.Init().
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(unix.F_ADD_SEALS),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(unix.F_ADD_SEALS),
 		},
 	},
 	syscall.SYS_FSTAT:     {},
@@ -91,31 +80,31 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_FTRUNCATE: {},
 	syscall.SYS_FUTEX: {
 		seccomp.Rule{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0),
 		},
 		seccomp.Rule{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0),
 		},
 		// Non-private futex used for flipcall.
 		seccomp.Rule{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAIT),
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAIT),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
 		},
 		seccomp.Rule{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAKE),
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAKE),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
 		},
 	},
 	syscall.SYS_GETDENTS64:   {},
@@ -128,6 +117,7 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_MADVISE:      {},
 	unix.SYS_MEMFD_CREATE:    {}, /// Used by flipcall.PacketWindowAllocator.Init().
 	syscall.SYS_MKDIRAT:      {},
+	syscall.SYS_MKNODAT:      {},
 	// Used by the Go runtime as a temporarily workaround for a Linux
 	// 5.2-5.4 bug.
 	//
@@ -136,28 +126,28 @@ var allowedSyscalls = seccomp.SyscallRules{
 	// TODO(b/148688965): Remove once this is gone from Go.
 	syscall.SYS_MLOCK: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(4096),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(4096),
 		},
 	},
 	syscall.SYS_MMAP: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_SHARED),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_SHARED),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
 		},
 	},
 	syscall.SYS_MPROTECT:   {},
@@ -171,14 +161,14 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_READLINKAT: {},
 	syscall.SYS_RECVMSG: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
 		},
 	},
 	syscall.SYS_RENAMEAT:        {},
@@ -189,33 +179,33 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_SENDMSG: []seccomp.Rule{
 		// Used by fdchannel.Endpoint.SendFD().
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0),
 		},
 		// Used by unet.SocketWriter.WriteVec().
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
 		},
 	},
 	syscall.SYS_SHUTDOWN: []seccomp.Rule{
-		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
+		{seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RDWR)},
 	},
 	syscall.SYS_SIGALTSTACK: {},
 	// Used by fdchannel.NewConnectedSockets().
 	syscall.SYS_SOCKETPAIR: {
 		{
-			seccomp.AllowValue(syscall.AF_UNIX),
-			seccomp.AllowValue(syscall.SOCK_SEQPACKET | syscall.SOCK_CLOEXEC),
-			seccomp.AllowValue(0),
+			seccomp.EqualTo(syscall.AF_UNIX),
+			seccomp.EqualTo(syscall.SOCK_SEQPACKET | syscall.SOCK_CLOEXEC),
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_SYMLINKAT: {},
 	syscall.SYS_TGKILL: []seccomp.Rule{
 		{
-			seccomp.AllowValue(uint64(os.Getpid())),
+			seccomp.EqualTo(uint64(os.Getpid())),
 		},
 	},
 	syscall.SYS_UNLINKAT:  {},
@@ -226,24 +216,24 @@ var allowedSyscalls = seccomp.SyscallRules{
 var udsSyscalls = seccomp.SyscallRules{
 	syscall.SYS_SOCKET: []seccomp.Rule{
 		{
-			seccomp.AllowValue(syscall.AF_UNIX),
-			seccomp.AllowValue(syscall.SOCK_STREAM),
-			seccomp.AllowValue(0),
+			seccomp.EqualTo(syscall.AF_UNIX),
+			seccomp.EqualTo(syscall.SOCK_STREAM),
+			seccomp.EqualTo(0),
 		},
 		{
-			seccomp.AllowValue(syscall.AF_UNIX),
-			seccomp.AllowValue(syscall.SOCK_DGRAM),
-			seccomp.AllowValue(0),
+			seccomp.EqualTo(syscall.AF_UNIX),
+			seccomp.EqualTo(syscall.SOCK_DGRAM),
+			seccomp.EqualTo(0),
 		},
 		{
-			seccomp.AllowValue(syscall.AF_UNIX),
-			seccomp.AllowValue(syscall.SOCK_SEQPACKET),
-			seccomp.AllowValue(0),
+			seccomp.EqualTo(syscall.AF_UNIX),
+			seccomp.EqualTo(syscall.SOCK_SEQPACKET),
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_CONNECT: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
 		},
 	},
 }
diff --git a/runsc/fsgofer/filter/config_amd64.go b/runsc/fsgofer/filter/config_amd64.go
index a4b28cb8b..686753d96 100644
--- a/runsc/fsgofer/filter/config_amd64.go
+++ b/runsc/fsgofer/filter/config_amd64.go
@@ -25,8 +25,41 @@ import (
 
 func init() {
 	allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{
-		{seccomp.AllowValue(linux.ARCH_GET_FS)},
-		{seccomp.AllowValue(linux.ARCH_SET_FS)},
+		// TODO(b/168828518): No longer used in Go 1.16+.
+		{seccomp.EqualTo(linux.ARCH_SET_FS)},
+	}
+
+	allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{
+		// parent_tidptr and child_tidptr are always 0 because neither
+		// CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used.
+		{
+			seccomp.EqualTo(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SETTLS |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+			seccomp.MatchAny{}, // newsp
+			seccomp.EqualTo(0), // parent_tidptr
+			seccomp.EqualTo(0), // child_tidptr
+			seccomp.MatchAny{}, // tls
+		},
+		{
+			// TODO(b/168828518): No longer used in Go 1.16+ (on amd64).
+			seccomp.EqualTo(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+			seccomp.MatchAny{}, // newsp
+			seccomp.EqualTo(0), // parent_tidptr
+			seccomp.EqualTo(0), // child_tidptr
+			seccomp.MatchAny{}, // tls
+		},
 	}
 
 	allowedSyscalls[syscall.SYS_NEWFSTATAT] = []seccomp.Rule{}
diff --git a/runsc/fsgofer/filter/config_arm64.go b/runsc/fsgofer/filter/config_arm64.go
index d2697deb7..ff0cf77a0 100644
--- a/runsc/fsgofer/filter/config_arm64.go
+++ b/runsc/fsgofer/filter/config_arm64.go
@@ -23,5 +23,26 @@ import (
 )
 
 func init() {
+	allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{
+		// Neither CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID is used, so the host
+		// ignores parent_tidptr and child_tidptr (see the note on them below).
+		{
+			seccomp.EqualTo(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+			seccomp.MatchAny{}, // newsp
+			// These arguments are left uninitialized by the Go
+			// runtime, so they may be anything (and are unused by
+			// the host).
+			seccomp.MatchAny{}, // parent_tidptr
+			seccomp.MatchAny{}, // tls
+			seccomp.MatchAny{}, // child_tidptr
+		},
+	}
+
 	allowedSyscalls[syscall.SYS_FSTATAT] = []seccomp.Rule{}
 }
diff --git a/runsc/fsgofer/filter/extra_filters_race.go b/runsc/fsgofer/filter/extra_filters_race.go
index 885c92f7a..20a0732be 100644
--- a/runsc/fsgofer/filter/extra_filters_race.go
+++ b/runsc/fsgofer/filter/extra_filters_race.go
@@ -35,6 +35,7 @@ func instrumentationFilters() seccomp.SyscallRules {
 		syscall.SYS_MUNLOCK:         {},
 		syscall.SYS_NANOSLEEP:       {},
 		syscall.SYS_OPEN:            {},
+		syscall.SYS_OPENAT:          {},
 		syscall.SYS_SET_ROBUST_LIST: {},
 		// Used within glibc's malloc.
 		syscall.SYS_TIME: {},
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index edc239013..0b628c8ce 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -29,7 +29,6 @@ import (
 	"path/filepath"
 	"runtime"
 	"strconv"
-	"syscall"
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -45,39 +44,11 @@ const (
 	// modes to ensure an unopened/closed file fails all mode checks.
 	invalidMode = p9.OpenFlags(math.MaxUint32)
 
-	openFlags = syscall.O_NOFOLLOW | syscall.O_CLOEXEC
-)
-
-type fileType int
+	openFlags = unix.O_NOFOLLOW | unix.O_CLOEXEC
 
-const (
-	regular fileType = iota
-	directory
-	symlink
-	socket
-	unknown
+	allowedOpenFlags = unix.O_TRUNC
 )
 
-// String implements fmt.Stringer.
-func (f fileType) String() string {
-	switch f {
-	case regular:
-		return "regular"
-	case directory:
-		return "directory"
-	case symlink:
-		return "symlink"
-	case socket:
-		return "socket"
-	}
-	return "unknown"
-}
-
-// ControlSocketAddr generates an abstract unix socket name for the given id.
-func ControlSocketAddr(id string) string {
-	return fmt.Sprintf("\x00runsc-gofer.%s", id)
-}
-
 // Config sets configuration options for each attach point.
 type Config struct {
 	// ROMount is set to true if this is a readonly mount.
@@ -132,19 +103,19 @@ func (a *attachPoint) Attach() (p9.File, error) {
 		return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix)
 	}
 
-	f, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) {
+	f, readable, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) {
 		return fd.Open(a.prefix, openFlags|mode, 0)
 	})
 	if err != nil {
 		return nil, fmt.Errorf("unable to open %q: %v", a.prefix, err)
 	}
 
-	stat, err := stat(f.FD())
+	stat, err := fstat(f.FD())
 	if err != nil {
 		return nil, fmt.Errorf("unable to stat %q: %v", a.prefix, err)
 	}
 
-	lf, err := newLocalFile(a, f, a.prefix, stat)
+	lf, err := newLocalFile(a, f, a.prefix, readable, stat)
 	if err != nil {
 		return nil, fmt.Errorf("unable to create localFile %q: %v", a.prefix, err)
 	}
@@ -153,7 +124,7 @@ func (a *attachPoint) Attach() (p9.File, error) {
 }
 
 // makeQID returns a unique QID for the given stat buffer.
-func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
+func (a *attachPoint) makeQID(stat unix.Stat_t) p9.QID {
 	a.deviceMu.Lock()
 	defer a.deviceMu.Unlock()
 
@@ -175,8 +146,6 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
 		log.Warningf("first 8 bytes of host inode id %x will be truncated to construct virtual inode id", stat.Ino)
 	}
 	ino := uint64(dev)<<56 | maskedIno
-	log.Debugf("host inode %x on device %x mapped to virtual inode %x", stat.Ino, stat.Dev, ino)
-
 	return p9.QID{
 		Type: p9.FileMode(stat.Mode).QIDType(),
 		Path: ino,
@@ -186,9 +155,7 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
 // localFile implements p9.File wrapping a local file. The underlying file
 // is opened during Walk() and stored in 'file' to be used with other
 // operations. The file is opened as readonly, unless it's a symlink or there is
-// no read access, which requires O_PATH. 'file' is dup'ed when Walk(nil) is
-// called to clone the file. This reduces the number of walks that need to be
-// done by the host file system when files are reused.
+// no read access, which requires O_PATH.
 //
 // The file may be reopened if the requested mode in Open() is not a subset of
 // current mode. Consequently, 'file' could have a mode wider than requested and
@@ -200,13 +167,30 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
 // performance with 'overlay2' storage driver. overlay2 eagerly copies the
 // entire file up when it's opened in write mode, and would perform badly when
 // multiple files are only being opened for read (esp. startup).
+//
+// File operations must use "at" functions whenever possible:
+//   * Local operations must use AT_EMPTY_PATH:
+//       fchownat(fd, "", AT_EMPTY_PATH, ...), instead of chown(fullpath, ...)
+//   * Creation operations must use (fd + name):
+//       mkdirat(fd, name, ...), instead of mkdir(fullpath, ...)
+//
+// Apart from being faster, it also adds another layer of defense against
+// symlink attacks (note that O_NOFOLLOW applies only to the last element in
+// the path).
+//
+// The few exceptions where it cannot be done are: utimensat on symlinks, and
+// Connect() for the socket address.
 type localFile struct {
-	p9.DefaultWalkGetAttr
+	p9.DisallowClientCalls
 
 	// attachPoint is the attachPoint that serves this localFile.
 	attachPoint *attachPoint
 
-	// hostPath will be safely updated by the Renamed hook.
+	// hostPath is the full path to the host file. It can be used for logging and
+	// the few cases where full path is required to operate on the host file. In
+	// all other cases, use "file" directly.
+	//
+	// Note: it's safely updated by the Renamed hook.
 	hostPath string
 
 	// file is opened when localFile is created and it's never nil. It may be
@@ -214,12 +198,19 @@ type localFile struct {
 	// opened with.
 	file *fd.FD
 
+	// controlReadable tells whether 'file' was opened with read permissions
+	// during a walk.
+	controlReadable bool
+
 	// mode is the mode in which the file was opened. Set to invalidMode
 	// if localFile isn't opened.
 	mode p9.OpenFlags
 
-	// ft is the fileType for this file.
-	ft fileType
+	// fileType for this file. It is equivalent to:
+	// unix.Stat_t.Mode & unix.S_IFMT
+	fileType uint32
+
+	qid p9.QID
 
 	// readDirMu protects against concurrent Readdir calls.
 	readDirMu sync.Mutex
@@ -236,7 +227,7 @@ var procSelfFD *fd.FD
 // OpenProcSelfFD opens the /proc/self/fd directory, which will be used to
 // reopen file descriptors.
 func OpenProcSelfFD() error {
-	d, err := syscall.Open("/proc/self/fd", syscall.O_RDONLY|syscall.O_DIRECTORY, 0)
+	d, err := unix.Open("/proc/self/fd", unix.O_RDONLY|unix.O_DIRECTORY, 0)
 	if err != nil {
 		return fmt.Errorf("error opening /proc/self/fd: %v", err)
 	}
@@ -245,7 +236,7 @@ func OpenProcSelfFD() error {
 }
 
 func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) {
-	d, err := syscall.Openat(int(procSelfFD.FD()), strconv.Itoa(f.FD()), mode&^syscall.O_NOFOLLOW, 0)
+	d, err := unix.Openat(int(procSelfFD.FD()), strconv.Itoa(f.FD()), mode&^unix.O_NOFOLLOW, 0)
 	if err != nil {
 		return nil, err
 	}
@@ -253,83 +244,88 @@ func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) {
 	return fd.New(d), nil
 }
 
-func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, error) {
-	path := path.Join(parent.hostPath, name)
-	f, err := openAnyFile(path, func(mode int) (*fd.FD, error) {
+func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, bool, error) {
+	pathDebug := path.Join(parent.hostPath, name)
+	f, readable, err := openAnyFile(pathDebug, func(mode int) (*fd.FD, error) {
 		return fd.OpenAt(parent.file, name, openFlags|mode, 0)
 	})
-	return f, path, err
+	return f, pathDebug, readable, err
 }
 
-// openAnyFile attempts to open the file in O_RDONLY and if it fails fallsback
+// openAnyFile attempts to open the file in O_RDONLY. If it fails, falls back
 // to O_PATH. 'path' is used for logging messages only. 'fn' is what does the
 // actual file open and is customizable by the caller.
-func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error) {
+func openAnyFile(pathDebug string, fn func(mode int) (*fd.FD, error)) (*fd.FD, bool, error) {
 	// Attempt to open file in the following mode in order:
 	//   1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs.
 	//      Use non-blocking to prevent getting stuck inside open(2) for
 	//      FIFOs. This option has no effect on regular files.
 	//   2. PATH: for symlinks, sockets.
-	modes := []int{syscall.O_RDONLY | syscall.O_NONBLOCK, unix.O_PATH}
+	options := []struct {
+		mode     int
+		readable bool
+	}{
+		{
+			mode:     unix.O_RDONLY | unix.O_NONBLOCK,
+			readable: true,
+		},
+		{
+			mode:     unix.O_PATH,
+			readable: false,
+		},
+	}
 
 	var err error
-	var file *fd.FD
-	for i, mode := range modes {
-		file, err = fn(mode)
+	for i, option := range options {
+		var file *fd.FD
+		file, err = fn(option.mode)
 		if err == nil {
-			// openat succeeded, we're done.
-			break
+			// Succeeded opening the file, we're done.
+			return file, option.readable, nil
 		}
 		switch e := extractErrno(err); e {
-		case syscall.ENOENT:
+		case unix.ENOENT:
 			// File doesn't exist, no point in retrying.
-			return nil, e
+			return nil, false, e
 		}
-		// openat failed. Try again with next mode, preserving 'err' in case this
-		// was the last attempt.
-		log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|mode, path, err)
+		// File failed to open. Try again with next mode, preserving 'err' in case
+		// this was the last attempt.
+		log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|option.mode, pathDebug, err)
 	}
-	if err != nil {
-		// All attempts to open file have failed, return the last error.
-		log.Debugf("Failed to open file, path: %q, err: %v", path, err)
-		return nil, extractErrno(err)
-	}
-
-	return file, nil
+	// All attempts to open file have failed, return the last error.
+	log.Debugf("Failed to open file, path: %q, err: %v", pathDebug, err)
+	return nil, false, extractErrno(err)
 }
 
-func getSupportedFileType(stat syscall.Stat_t, permitSocket bool) (fileType, error) {
-	var ft fileType
-	switch stat.Mode & syscall.S_IFMT {
-	case syscall.S_IFREG:
-		ft = regular
-	case syscall.S_IFDIR:
-		ft = directory
-	case syscall.S_IFLNK:
-		ft = symlink
-	case syscall.S_IFSOCK:
+func checkSupportedFileType(stat unix.Stat_t, permitSocket bool) error {
+	switch stat.Mode & unix.S_IFMT {
+	case unix.S_IFREG, unix.S_IFDIR, unix.S_IFLNK:
+		return nil
+
+	case unix.S_IFSOCK:
 		if !permitSocket {
-			return unknown, syscall.EPERM
+			return unix.EPERM
 		}
-		ft = socket
+		return nil
+
 	default:
-		return unknown, syscall.EPERM
+		return unix.EPERM
 	}
-	return ft, nil
 }
 
-func newLocalFile(a *attachPoint, file *fd.FD, path string, stat syscall.Stat_t) (*localFile, error) {
-	ft, err := getSupportedFileType(stat, a.conf.HostUDS)
-	if err != nil {
+func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat unix.Stat_t) (*localFile, error) {
+	if err := checkSupportedFileType(stat, a.conf.HostUDS); err != nil {
 		return nil, err
 	}
 
 	return &localFile{
-		attachPoint: a,
-		hostPath:    path,
-		file:        file,
-		mode:        invalidMode,
-		ft:          ft,
+		attachPoint:     a,
+		hostPath:        path,
+		file:            file,
+		mode:            invalidMode,
+		fileType:        stat.Mode & unix.S_IFMT,
+		qid:             a.makeQID(stat),
+		controlReadable: readable,
 	}, nil
 }
 
@@ -337,7 +333,7 @@ func newLocalFile(a *attachPoint, file *fd.FD, path string, stat syscall.Stat_t)
 // non-blocking. If anything fails, returns nil. It's better to have a file
 // without host FD, than to fail the operation.
 func newFDMaybe(file *fd.FD) *fd.FD {
-	dupFD, err := syscall.Dup(file.FD())
+	dupFD, err := unix.Dup(file.FD())
 	// Technically, the runtime may call the finalizer on file as soon as
 	// FD() returns.
 	runtime.KeepAlive(file)
@@ -347,23 +343,23 @@ func newFDMaybe(file *fd.FD) *fd.FD {
 	dup := fd.New(dupFD)
 
 	// fd is blocking; non-blocking is required.
-	if err := syscall.SetNonblock(dup.FD(), true); err != nil {
-		dup.Close()
+	if err := unix.SetNonblock(dup.FD(), true); err != nil {
+		_ = dup.Close()
 		return nil
 	}
 	return dup
 }
 
-func stat(fd int) (syscall.Stat_t, error) {
-	var stat syscall.Stat_t
-	if err := syscall.Fstat(fd, &stat); err != nil {
-		return syscall.Stat_t{}, err
+func fstat(fd int) (unix.Stat_t, error) {
+	var stat unix.Stat_t
+	if err := unix.Fstat(fd, &stat); err != nil {
+		return unix.Stat_t{}, err
 	}
 	return stat, nil
 }
 
 func fchown(fd int, uid p9.UID, gid p9.GID) error {
-	return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW)
+	return unix.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW)
 }
 
 // Open implements p9.File.
@@ -371,10 +367,16 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
 	if l.isOpen() {
 		panic(fmt.Sprintf("attempting to open already opened file: %q", l.hostPath))
 	}
+	mode := flags & p9.OpenFlagsModeMask
+	if mode == p9.WriteOnly || mode == p9.ReadWrite || flags&p9.OpenTruncate != 0 {
+		if err := l.checkROMount(); err != nil {
+			return nil, p9.QID{}, 0, err
+		}
+	}
 
 	// Check if control file can be used or if a new open must be created.
 	var newFile *fd.FD
-	if flags == p9.ReadOnly {
+	if mode == p9.ReadOnly && l.controlReadable && flags.OSFlags()&allowedOpenFlags == 0 {
 		log.Debugf("Open reusing control file, flags: %v, %q", flags, l.hostPath)
 		newFile = l.file
 	} else {
@@ -383,23 +385,15 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
 		// name_to_handle_at and open_by_handle_at aren't supported by overlay2.
 		log.Debugf("Open reopening file, flags: %v, %q", flags, l.hostPath)
 		var err error
-		// Constrain open flags to the open mode and O_TRUNC.
-		newFile, err = reopenProcFd(l.file, openFlags|(flags.OSFlags()&(syscall.O_ACCMODE|syscall.O_TRUNC)))
+		osFlags := flags.OSFlags() & (unix.O_ACCMODE | allowedOpenFlags)
+		newFile, err = reopenProcFd(l.file, openFlags|osFlags)
 		if err != nil {
 			return nil, p9.QID{}, 0, extractErrno(err)
 		}
 	}
 
-	stat, err := stat(newFile.FD())
-	if err != nil {
-		if newFile != l.file {
-			newFile.Close()
-		}
-		return nil, p9.QID{}, 0, extractErrno(err)
-	}
-
 	var fd *fd.FD
-	if stat.Mode&syscall.S_IFMT == syscall.S_IFREG {
+	if l.fileType == unix.S_IFREG {
 		// Donate FD for regular files only.
 		fd = newFDMaybe(newFile)
 	}
@@ -411,38 +405,38 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
 		}
 		l.file = newFile
 	}
-	l.mode = flags & p9.OpenFlagsModeMask
-	return fd, l.attachPoint.makeQID(stat), 0, nil
+	l.mode = mode
+	return fd, l.qid, 0, nil
 }
 
 // Create implements p9.File.
-func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) {
-	conf := l.attachPoint.conf
-	if conf.ROMount {
-		if conf.PanicOnWrite {
-			panic("attempt to write to RO mount")
-		}
-		return nil, nil, p9.QID{}, 0, syscall.EBADF
+func (l *localFile) Create(name string, p9Flags p9.OpenFlags, perm p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) {
+	if err := l.checkROMount(); err != nil {
+		return nil, nil, p9.QID{}, 0, err
 	}
 
+	// Set file creation flags, plus allowed open flags from caller.
+	osFlags := openFlags | unix.O_CREAT | unix.O_EXCL
+	osFlags |= p9Flags.OSFlags() & allowedOpenFlags
+
 	// 'file' may be used for other operations (e.g. Walk), so read access is
 	// always added to flags. Note that resulting file might have a wider mode
 	// than needed for each particular case.
-	flags := openFlags | syscall.O_CREAT | syscall.O_EXCL
+	mode := p9Flags & p9.OpenFlagsModeMask
 	if mode == p9.WriteOnly {
-		flags |= syscall.O_RDWR
+		osFlags |= unix.O_RDWR
 	} else {
-		flags |= mode.OSFlags()
+		osFlags |= mode.OSFlags()
 	}
 
-	child, err := fd.OpenAt(l.file, name, flags, uint32(perm.Permissions()))
+	child, err := fd.OpenAt(l.file, name, osFlags, uint32(perm.Permissions()))
 	if err != nil {
 		return nil, nil, p9.QID{}, 0, extractErrno(err)
 	}
 	cu := cleanup.Make(func() {
-		child.Close()
+		_ = child.Close()
 		// Best effort attempt to remove the file in case of failure.
-		if err := syscall.Unlinkat(l.file.FD(), name); err != nil {
+		if err := unix.Unlinkat(l.file.FD(), name, 0); err != nil {
 			log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err)
 		}
 	})
@@ -451,7 +445,7 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid
 	if err := fchown(child.FD(), uid, gid); err != nil {
 		return nil, nil, p9.QID{}, 0, extractErrno(err)
 	}
-	stat, err := stat(child.FD())
+	stat, err := fstat(child.FD())
 	if err != nil {
 		return nil, nil, p9.QID{}, 0, extractErrno(err)
 	}
@@ -461,23 +455,21 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid
 		hostPath:    path.Join(l.hostPath, name),
 		file:        child,
 		mode:        mode,
+		fileType:    unix.S_IFREG,
+		qid:         l.attachPoint.makeQID(stat),
 	}
 
 	cu.Release()
-	return newFDMaybe(c.file), c, l.attachPoint.makeQID(stat), 0, nil
+	return newFDMaybe(c.file), c, c.qid, 0, nil
 }
 
 // Mkdir implements p9.File.
 func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) {
-	conf := l.attachPoint.conf
-	if conf.ROMount {
-		if conf.PanicOnWrite {
-			panic("attempt to write to RO mount")
-		}
-		return p9.QID{}, syscall.EBADF
+	if err := l.checkROMount(); err != nil {
+		return p9.QID{}, err
 	}
 
-	if err := syscall.Mkdirat(l.file.FD(), name, uint32(perm.Permissions())); err != nil {
+	if err := unix.Mkdirat(l.file.FD(), name, uint32(perm.Permissions())); err != nil {
 		return p9.QID{}, extractErrno(err)
 	}
 	cu := cleanup.Make(func() {
@@ -489,7 +481,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID)
 	defer cu.Clean()
 
 	// Open directory to change ownership and stat it.
-	flags := syscall.O_DIRECTORY | syscall.O_RDONLY | openFlags
+	flags := unix.O_DIRECTORY | unix.O_RDONLY | openFlags
 	f, err := fd.OpenAt(l.file, name, flags, 0)
 	if err != nil {
 		return p9.QID{}, extractErrno(err)
@@ -499,7 +491,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID)
 	if err := fchown(f.FD(), uid, gid); err != nil {
 		return p9.QID{}, extractErrno(err)
 	}
-	stat, err := stat(f.FD())
+	stat, err := fstat(f.FD())
 	if err != nil {
 		return p9.QID{}, extractErrno(err)
 	}
@@ -510,61 +502,80 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID)
 
 // Walk implements p9.File.
 func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) {
+	qids, file, _, err := l.walk(names)
+	return qids, file, err
+}
+
+// WalkGetAttr implements p9.File.
+func (l *localFile) WalkGetAttr(names []string) ([]p9.QID, p9.File, p9.AttrMask, p9.Attr, error) {
+	qids, file, stat, err := l.walk(names)
+	if err != nil {
+		return nil, nil, p9.AttrMask{}, p9.Attr{}, err
+	}
+	mask, attr := l.fillAttr(stat)
+	return qids, file, mask, attr, nil
+}
+
+func (l *localFile) walk(names []string) ([]p9.QID, p9.File, unix.Stat_t, error) {
 	// Duplicate current file if 'names' is empty.
 	if len(names) == 0 {
-		newFile, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) {
+		newFile, readable, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) {
 			return reopenProcFd(l.file, openFlags|mode)
 		})
 		if err != nil {
-			return nil, nil, extractErrno(err)
+			return nil, nil, unix.Stat_t{}, extractErrno(err)
 		}
 
-		stat, err := stat(newFile.FD())
+		stat, err := fstat(newFile.FD())
 		if err != nil {
-			newFile.Close()
-			return nil, nil, extractErrno(err)
+			_ = newFile.Close()
+			return nil, nil, unix.Stat_t{}, extractErrno(err)
 		}
 
 		c := &localFile{
-			attachPoint: l.attachPoint,
-			hostPath:    l.hostPath,
-			file:        newFile,
-			mode:        invalidMode,
+			attachPoint:     l.attachPoint,
+			hostPath:        l.hostPath,
+			file:            newFile,
+			mode:            invalidMode,
+			fileType:        l.fileType,
+			qid:             l.attachPoint.makeQID(stat),
+			controlReadable: readable,
 		}
-		return []p9.QID{l.attachPoint.makeQID(stat)}, c, nil
+		return []p9.QID{c.qid}, c, stat, nil
 	}
 
 	var qids []p9.QID
+	var lastStat unix.Stat_t
 	last := l
 	for _, name := range names {
-		f, path, err := openAnyFileFromParent(last, name)
+		f, path, readable, err := openAnyFileFromParent(last, name)
 		if last != l {
-			last.Close()
+			_ = last.Close()
 		}
 		if err != nil {
-			return nil, nil, extractErrno(err)
+			return nil, nil, unix.Stat_t{}, extractErrno(err)
 		}
-		stat, err := stat(f.FD())
+		lastStat, err = fstat(f.FD())
 		if err != nil {
-			f.Close()
-			return nil, nil, extractErrno(err)
+			_ = f.Close()
+			return nil, nil, unix.Stat_t{}, extractErrno(err)
 		}
-		c, err := newLocalFile(last.attachPoint, f, path, stat)
+		c, err := newLocalFile(last.attachPoint, f, path, readable, lastStat)
 		if err != nil {
-			f.Close()
-			return nil, nil, extractErrno(err)
+			_ = f.Close()
+			return nil, nil, unix.Stat_t{}, extractErrno(err)
 		}
 
-		qids = append(qids, l.attachPoint.makeQID(stat))
+		qids = append(qids, c.qid)
 		last = c
 	}
-	return qids, last, nil
+	return qids, last, lastStat, nil
 }
 
 // StatFS implements p9.File.
 func (l *localFile) StatFS() (p9.FSStat, error) {
-	var s syscall.Statfs_t
-	if err := syscall.Fstatfs(l.file.FD(), &s); err != nil {
+	var s unix.Statfs_t
+	if err := unix.Fstatfs(l.file.FD(), &s); err != nil {
 		return p9.FSStat{}, extractErrno(err)
 	}
 
@@ -584,9 +595,9 @@ func (l *localFile) StatFS() (p9.FSStat, error) {
 // FSync implements p9.File.
 func (l *localFile) FSync() error {
 	if !l.isOpen() {
-		return syscall.EBADF
+		return unix.EBADF
 	}
-	if err := syscall.Fsync(l.file.FD()); err != nil {
+	if err := unix.Fsync(l.file.FD()); err != nil {
 		return extractErrno(err)
 	}
 	return nil
@@ -594,11 +605,15 @@ func (l *localFile) FSync() error {
 
 // GetAttr implements p9.File.
 func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) {
-	stat, err := stat(l.file.FD())
+	stat, err := fstat(l.file.FD())
 	if err != nil {
 		return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err)
 	}
+	mask, attr := l.fillAttr(stat)
+	return l.qid, mask, attr, nil
+}
 
+func (l *localFile) fillAttr(stat unix.Stat_t) (p9.AttrMask, p9.Attr) {
 	attr := p9.Attr{
 		Mode:             p9.FileMode(stat.Mode),
 		UID:              p9.UID(stat.Uid),
@@ -627,20 +642,15 @@ func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error)
 		MTime:  true,
 		CTime:  true,
 	}
-
-	return l.attachPoint.makeQID(stat), valid, attr, nil
+	return valid, attr
 }
 
 // SetAttr implements p9.File. Due to mismatch in file API, options
 // cannot be changed atomically and user may see partial changes when
 // an error happens.
 func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
-	conf := l.attachPoint.conf
-	if conf.ROMount {
-		if conf.PanicOnWrite {
-			panic("attempt to write to RO mount")
-		}
-		return syscall.EBADF
+	if err := l.checkROMount(); err != nil {
+		return err
 	}
 
 	allowed := p9.SetAttrMask{
@@ -663,13 +673,13 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 	// consistent result that is not attribute dependent.
 	if !valid.IsSubsetOf(allowed) {
 		log.Warningf("SetAttr() failed for %q, mask: %v", l.hostPath, valid)
-		return syscall.EPERM
+		return unix.EPERM
 	}
 
 	// Check if it's possible to use cached file, or if another one needs to be
 	// opened for write.
 	f := l.file
-	if l.ft == regular && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite {
+	if l.fileType == unix.S_IFREG && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite {
 		var err error
 		f, err = reopenProcFd(l.file, openFlags|os.O_WRONLY)
 		if err != nil {
@@ -690,21 +700,21 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 	// over another.
 	var err error
 	if valid.Permissions {
-		if cerr := syscall.Fchmod(f.FD(), uint32(attr.Permissions)); cerr != nil {
+		if cerr := unix.Fchmod(f.FD(), uint32(attr.Permissions)); cerr != nil {
 			log.Debugf("SetAttr fchmod failed %q, err: %v", l.hostPath, cerr)
 			err = extractErrno(cerr)
 		}
 	}
 
 	if valid.Size {
-		if terr := syscall.Ftruncate(f.FD(), int64(attr.Size)); terr != nil {
+		if terr := unix.Ftruncate(f.FD(), int64(attr.Size)); terr != nil {
 			log.Debugf("SetAttr ftruncate failed %q, err: %v", l.hostPath, terr)
 			err = extractErrno(terr)
 		}
 	}
 
 	if valid.ATime || valid.MTime {
-		utimes := [2]syscall.Timespec{
+		utimes := [2]unix.Timespec{
 			{Sec: 0, Nsec: linux.UTIME_OMIT},
 			{Sec: 0, Nsec: linux.UTIME_OMIT},
 		}
@@ -725,15 +735,15 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 			}
 		}
 
-		if l.ft == symlink {
+		if l.fileType == unix.S_IFLNK {
 			// utimensat operates different that other syscalls. To operate on a
 			// symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty
 			// name.
-			parent, err := syscall.Open(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0)
+			parent, err := unix.Open(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0)
 			if err != nil {
 				return extractErrno(err)
 			}
-			defer syscall.Close(parent)
+			defer unix.Close(parent)
 
 			if terr := utimensat(parent, path.Base(l.hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); terr != nil {
 				log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr)
@@ -758,7 +768,7 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 		if valid.GID {
 			gid = int(attr.GID)
 		}
-		if oerr := syscall.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil {
+		if oerr := unix.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil {
 			log.Debugf("SetAttr fchownat failed %q, err: %v", l.hostPath, oerr)
 			err = extractErrno(oerr)
 		}
@@ -768,28 +778,28 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
 }
 
 func (*localFile) GetXattr(string, uint64) (string, error) {
-	return "", syscall.EOPNOTSUPP
+	return "", unix.EOPNOTSUPP
 }
 
 func (*localFile) SetXattr(string, string, uint32) error {
-	return syscall.EOPNOTSUPP
+	return unix.EOPNOTSUPP
 }
 
 func (*localFile) ListXattr(uint64) (map[string]struct{}, error) {
-	return nil, syscall.EOPNOTSUPP
+	return nil, unix.EOPNOTSUPP
 }
 
 func (*localFile) RemoveXattr(string) error {
-	return syscall.EOPNOTSUPP
+	return unix.EOPNOTSUPP
 }
 
 // Allocate implements p9.File.
 func (l *localFile) Allocate(mode p9.AllocateMode, offset, length uint64) error {
 	if !l.isOpen() {
-		return syscall.EBADF
+		return unix.EBADF
 	}
 
-	if err := syscall.Fallocate(l.file.FD(), mode.ToLinux(), int64(offset), int64(length)); err != nil {
+	if err := unix.Fallocate(l.file.FD(), mode.ToLinux(), int64(offset), int64(length)); err != nil {
 		return extractErrno(err)
 	}
 	return nil
@@ -802,12 +812,8 @@ func (*localFile) Rename(p9.File, string) error {
 
 // RenameAt implements p9.File.RenameAt.
 func (l *localFile) RenameAt(oldName string, directory p9.File, newName string) error {
-	conf := l.attachPoint.conf
-	if conf.ROMount {
-		if conf.PanicOnWrite {
-			panic("attempt to write to RO mount")
-		}
-		return syscall.EBADF
+	if err := l.checkROMount(); err != nil {
+		return err
 	}
 
 	newParent := directory.(*localFile)
@@ -820,10 +826,10 @@ func (l *localFile) RenameAt(oldName string, directory p9.File, newName string)
 // ReadAt implements p9.File.
 func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) {
 	if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite {
-		return 0, syscall.EBADF
+		return 0, unix.EBADF
 	}
 	if !l.isOpen() {
-		return 0, syscall.EBADF
+		return 0, unix.EBADF
 	}
 
 	r, err := l.file.ReadAt(p, int64(offset))
@@ -838,10 +844,10 @@ func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) {
 // WriteAt implements p9.File.
 func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) {
 	if l.mode != p9.WriteOnly && l.mode != p9.ReadWrite {
-		return 0, syscall.EBADF
+		return 0, unix.EBADF
 	}
 	if !l.isOpen() {
-		return 0, syscall.EBADF
+		return 0, unix.EBADF
 	}
 
 	w, err := l.file.WriteAt(p, int64(offset))
@@ -853,12 +859,8 @@ func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) {
 
 // Symlink implements p9.File.
 func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) {
-	conf := l.attachPoint.conf
-	if conf.ROMount {
-		if conf.PanicOnWrite {
-			panic("attempt to write to RO mount")
-		}
-		return p9.QID{}, syscall.EBADF
+	if err := l.checkROMount(); err != nil {
+		return p9.QID{}, err
 	}
 
 	if err := unix.Symlinkat(target, l.file.FD(), newName); err != nil {
@@ -866,7 +868,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.
 	}
 	cu := cleanup.Make(func() {
 		// Best effort attempt to remove the symlink in case of failure.
-		if err := syscall.Unlinkat(l.file.FD(), newName); err != nil {
+		if err := unix.Unlinkat(l.file.FD(), newName, 0); err != nil {
 			log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, newName), err)
 		}
 	})
@@ -882,7 +884,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.
 	if err := fchown(f.FD(), uid, gid); err != nil {
 		return p9.QID{}, extractErrno(err)
 	}
-	stat, err := stat(f.FD())
+	stat, err := fstat(f.FD())
 	if err != nil {
 		return p9.QID{}, extractErrno(err)
 	}
@@ -893,12 +895,8 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.
 
 // Link implements p9.File.
 func (l *localFile) Link(target p9.File, newName string) error {
-	conf := l.attachPoint.conf
-	if conf.ROMount {
-		if conf.PanicOnWrite {
-			panic("attempt to write to RO mount")
-		}
-		return syscall.EBADF
+	if err := l.checkROMount(); err != nil {
+		return err
 	}
 
 	targetFile := target.(*localFile)
@@ -909,23 +907,53 @@ func (l *localFile) Link(target p9.File, newName string) error {
 }
 
 // Mknod implements p9.File.
-//
-// Not implemented.
-func (*localFile) Mknod(_ string, _ p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) {
+func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, uid p9.UID, gid p9.GID) (p9.QID, error) {
+	if err := l.checkROMount(); err != nil {
+		return p9.QID{}, err
+	}
+
 	// From mknod(2) man page:
 	// "EPERM: [...] if the filesystem containing pathname does not support
 	// the type of node requested."
-	return p9.QID{}, syscall.EPERM
+	if mode.FileType() != p9.ModeRegular {
+		return p9.QID{}, unix.EPERM
+	}
+
+	// Allow Mknod to create regular files (device number is passed as 0).
+	if err := unix.Mknodat(l.file.FD(), name, uint32(mode), 0); err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	cu := cleanup.Make(func() {
+		// Best effort attempt to remove the file in case of failure.
+		if err := unix.Unlinkat(l.file.FD(), name, 0); err != nil {
+			log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err)
+		}
+	})
+	defer cu.Clean()
+
+	// Open file to change ownership and stat it.
+	child, err := fd.OpenAt(l.file, name, unix.O_PATH|openFlags, 0)
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	defer child.Close()
+
+	if err := fchown(child.FD(), uid, gid); err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	stat, err := fstat(child.FD())
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+
+	cu.Release()
+	return l.attachPoint.makeQID(stat), nil
 }
 
 // UnlinkAt implements p9.File.
 func (l *localFile) UnlinkAt(name string, flags uint32) error {
-	conf := l.attachPoint.conf
-	if conf.ROMount {
-		if conf.PanicOnWrite {
-			panic("attempt to write to RO mount")
-		}
-		return syscall.EBADF
+	if err := l.checkROMount(); err != nil {
+		return err
 	}
 
 	if err := unix.Unlinkat(l.file.FD(), name, int(flags)); err != nil {
@@ -937,10 +965,10 @@ func (l *localFile) UnlinkAt(name string, flags uint32) error {
 // Readdir implements p9.File.
 func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) {
 	if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite {
-		return nil, syscall.EBADF
+		return nil, unix.EBADF
 	}
 	if !l.isOpen() {
-		return nil, syscall.EBADF
+		return nil, unix.EBADF
 	}
 
 	// Readdirnames is a cursor over directories, so seek back to 0 to ensure it's
@@ -951,10 +979,13 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) {
 
 	skip := uint64(0)
 
-	// Check if the file is at the correct position already. If not, seek to the
-	// beginning and read the entire directory again.
-	if l.lastDirentOffset != offset {
-		if _, err := syscall.Seek(l.file.FD(), 0, 0); err != nil {
+	// Check if the file is at the correct position already. If not, seek to
+	// the beginning and read the entire directory again. We always seek if
+	// offset is 0, since this is side-effectual (equivalent to rewinddir(3),
+	// which causes the directory stream to resynchronize with the directory's
+	// current contents).
+	if l.lastDirentOffset != offset || offset == 0 {
+		if _, err := unix.Seek(l.file.FD(), 0, 0); err != nil {
 			return nil, extractErrno(err)
 		}
 		skip = offset
@@ -987,7 +1018,7 @@ func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64)
 
 	end := offset + uint64(count)
 	for offset < end {
-		dirSize, err := syscall.ReadDirent(f, direntsBuf)
+		dirSize, err := unix.ReadDirent(f, direntsBuf)
 		if err != nil {
 			return dirents, err
 		}
@@ -996,7 +1027,7 @@ func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64)
 		}
 
 		names := names[:0]
-		_, _, names = syscall.ParseDirent(direntsBuf[:dirSize], -1, names)
+		_, _, names = unix.ParseDirent(direntsBuf[:dirSize], -1, names)
 
 		// Skip over entries that the caller is not interested in.
 		if skip > 0 {
@@ -1041,7 +1072,7 @@ func (l *localFile) Readlink() (string, error) {
 			return string(b[:n]), nil
 		}
 	}
-	return "", syscall.ENOMEM
+	return "", unix.ENOMEM
 }
 
 // Flush implements p9.File.
@@ -1052,7 +1083,7 @@ func (l *localFile) Flush() error {
 // Connect implements p9.File.
 func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) {
 	if !l.attachPoint.conf.HostUDS {
-		return nil, syscall.ECONNREFUSED
+		return nil, unix.ECONNREFUSED
 	}
 
 	// TODO(gvisor.dev/issue/1003): Due to different app vs replacement
@@ -1060,34 +1091,34 @@ func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) {
 	// fit f.path in our sockaddr. We'd need to redirect through a shorter
 	// path in order to actually connect to this socket.
 	if len(l.hostPath) > linux.UnixPathMax {
-		return nil, syscall.ECONNREFUSED
+		return nil, unix.ECONNREFUSED
 	}
 
 	var stype int
 	switch flags {
 	case p9.StreamSocket:
-		stype = syscall.SOCK_STREAM
+		stype = unix.SOCK_STREAM
 	case p9.DgramSocket:
-		stype = syscall.SOCK_DGRAM
+		stype = unix.SOCK_DGRAM
 	case p9.SeqpacketSocket:
-		stype = syscall.SOCK_SEQPACKET
+		stype = unix.SOCK_SEQPACKET
 	default:
-		return nil, syscall.ENXIO
+		return nil, unix.ENXIO
 	}
 
-	f, err := syscall.Socket(syscall.AF_UNIX, stype, 0)
+	f, err := unix.Socket(unix.AF_UNIX, stype, 0)
 	if err != nil {
 		return nil, err
 	}
 
-	if err := syscall.SetNonblock(f, true); err != nil {
-		syscall.Close(f)
+	if err := unix.SetNonblock(f, true); err != nil {
+		_ = unix.Close(f)
 		return nil, err
 	}
 
-	sa := syscall.SockaddrUnix{Name: l.hostPath}
-	if err := syscall.Connect(f, &sa); err != nil {
-		syscall.Close(f)
+	sa := unix.SockaddrUnix{Name: l.hostPath}
+	if err := unix.Connect(f, &sa); err != nil {
+		_ = unix.Close(f)
 		return nil, err
 	}
 
@@ -1112,7 +1143,7 @@ func (l *localFile) Renamed(newDir p9.File, newName string) {
 }
 
 // extractErrno tries to determine the errno.
-func extractErrno(err error) syscall.Errno {
+func extractErrno(err error) unix.Errno {
 	if err == nil {
 		// This should never happen. The likely result will be that
 		// some user gets the frustrating "error: SUCCESS" message.
@@ -1122,18 +1153,18 @@ func extractErrno(err error) syscall.Errno {
 
 	switch err {
 	case os.ErrNotExist:
-		return syscall.ENOENT
+		return unix.ENOENT
 	case os.ErrExist:
-		return syscall.EEXIST
+		return unix.EEXIST
 	case os.ErrPermission:
-		return syscall.EACCES
+		return unix.EACCES
 	case os.ErrInvalid:
-		return syscall.EINVAL
+		return unix.EINVAL
 	}
 
 	// See if it's an errno or a common wrapped error.
 	switch e := err.(type) {
-	case syscall.Errno:
+	case unix.Errno:
 		return e
 	case *os.PathError:
 		return extractErrno(e.Err)
@@ -1145,5 +1176,12 @@ func extractErrno(err error) syscall.Errno {
 
 	// Fall back to EIO.
 	log.Debugf("Unknown error: %v, defaulting to EIO", err)
-	return syscall.EIO
+	return unix.EIO
+}
+
+func (l *localFile) checkROMount() error {
+	if conf := l.attachPoint.conf; conf.ROMount {
+		return unix.EROFS
+	}
+	return nil
 }
diff --git a/runsc/fsgofer/fsgofer_amd64_unsafe.go b/runsc/fsgofer/fsgofer_amd64_unsafe.go
index 5d4aab597..c46958185 100644
--- a/runsc/fsgofer/fsgofer_amd64_unsafe.go
+++ b/runsc/fsgofer/fsgofer_amd64_unsafe.go
@@ -17,25 +17,25 @@
 package fsgofer
 
 import (
-	"syscall"
 	"unsafe"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/syserr"
 )
 
-func statAt(dirFd int, name string) (syscall.Stat_t, error) {
-	nameBytes, err := syscall.BytePtrFromString(name)
+func statAt(dirFd int, name string) (unix.Stat_t, error) {
+	nameBytes, err := unix.BytePtrFromString(name)
 	if err != nil {
-		return syscall.Stat_t{}, err
+		return unix.Stat_t{}, err
 	}
 	namePtr := unsafe.Pointer(nameBytes)
 
-	var stat syscall.Stat_t
+	var stat unix.Stat_t
 	statPtr := unsafe.Pointer(&stat)
 
-	if _, _, errno := syscall.Syscall6(
-		syscall.SYS_NEWFSTATAT,
+	if _, _, errno := unix.Syscall6(
+		unix.SYS_NEWFSTATAT,
 		uintptr(dirFd),
 		uintptr(namePtr),
 		uintptr(statPtr),
@@ -43,7 +43,7 @@ func statAt(dirFd int, name string) (syscall.Stat_t, error) {
 		0,
 		0); errno != 0 {
 
-		return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
+		return unix.Stat_t{}, syserr.FromHost(errno).ToError()
 	}
 	return stat, nil
 }
diff --git a/runsc/fsgofer/fsgofer_arm64_unsafe.go b/runsc/fsgofer/fsgofer_arm64_unsafe.go
index 8041fd352..491460718 100644
--- a/runsc/fsgofer/fsgofer_arm64_unsafe.go
+++ b/runsc/fsgofer/fsgofer_arm64_unsafe.go
@@ -17,25 +17,25 @@
 package fsgofer
 
 import (
-	"syscall"
 	"unsafe"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/syserr"
 )
 
-func statAt(dirFd int, name string) (syscall.Stat_t, error) {
-	nameBytes, err := syscall.BytePtrFromString(name)
+func statAt(dirFd int, name string) (unix.Stat_t, error) {
+	nameBytes, err := unix.BytePtrFromString(name)
 	if err != nil {
-		return syscall.Stat_t{}, err
+		return unix.Stat_t{}, err
 	}
 	namePtr := unsafe.Pointer(nameBytes)
 
-	var stat syscall.Stat_t
+	var stat unix.Stat_t
 	statPtr := unsafe.Pointer(&stat)
 
-	if _, _, errno := syscall.Syscall6(
-		syscall.SYS_FSTATAT,
+	if _, _, errno := unix.Syscall6(
+		unix.SYS_FSTATAT,
 		uintptr(dirFd),
 		uintptr(namePtr),
 		uintptr(statPtr),
@@ -43,7 +43,7 @@ func statAt(dirFd int, name string) (syscall.Stat_t, error) {
 		0,
 		0); errno != 0 {
 
-		return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
+		return unix.Stat_t{}, syserr.FromHost(errno).ToError()
 	}
 	return stat, nil
 }
diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go
index 05af7e397..a84206686 100644
--- a/runsc/fsgofer/fsgofer_test.go
+++ b/runsc/fsgofer/fsgofer_test.go
@@ -21,11 +21,24 @@ import (
 	"os"
 	"path"
 	"path/filepath"
-	"syscall"
 	"testing"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/test/testutil"
+)
+
+var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite}
+
+var (
+	allTypes = []uint32{unix.S_IFREG, unix.S_IFDIR, unix.S_IFLNK}
+
+	// allConfs is set in init().
+	allConfs []Config
+
+	rwConfs = []Config{{ROMount: false}}
+	roConfs = []Config{{ROMount: true}}
 )
 
 func init() {
@@ -39,6 +52,13 @@ func init() {
 	}
 }
 
+func configTestName(conf *Config) string {
+	if conf.ROMount {
+		return "ROMount"
+	}
+	return "RWMount"
+}
+
 func assertPanic(t *testing.T, f func()) {
 	defer func() {
 		if r := recover(); r == nil {
@@ -63,7 +83,7 @@ func testReadWrite(f p9.File, flags p9.OpenFlags, content []byte) error {
 		}
 		want = append(want, b...)
 	} else {
-		if e, ok := err.(syscall.Errno); !ok || e != syscall.EBADF {
+		if e, ok := err.(unix.Errno); !ok || e != unix.EBADF {
 			return fmt.Errorf("WriteAt() should have failed, got: %d, want: EBADFD", err)
 		}
 	}
@@ -81,78 +101,83 @@ func testReadWrite(f p9.File, flags p9.OpenFlags, content []byte) error {
 			return fmt.Errorf("ReadAt() wrong data, got: %s, want: %s", string(rBuf), want)
 		}
 	} else {
-		if e, ok := err.(syscall.Errno); !ok || e != syscall.EBADF {
+		if e, ok := err.(unix.Errno); !ok || e != unix.EBADF {
 			return fmt.Errorf("ReadAt() should have failed, got: %d, want: EBADFD", err)
 		}
 	}
 	return nil
 }
 
-var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite}
-
-var (
-	allTypes = []fileType{regular, directory, symlink}
-
-	// allConfs is set in init() above.
-	allConfs []Config
-
-	rwConfs = []Config{{ROMount: false}}
-	roConfs = []Config{{ROMount: true}}
-)
-
 type state struct {
-	root *localFile
-	file *localFile
-	conf Config
-	ft   fileType
+	root     *localFile
+	file     *localFile
+	conf     Config
+	fileType uint32
 }
 
 func (s state) String() string {
-	return fmt.Sprintf("type(%v)", s.ft)
+	return fmt.Sprintf("type(%v)", typeName(s.fileType))
+}
+
+func typeName(fileType uint32) string {
+	switch fileType {
+	case unix.S_IFREG:
+		return "file"
+	case unix.S_IFDIR:
+		return "directory"
+	case unix.S_IFLNK:
+		return "symlink"
+	default:
+		panic(fmt.Sprintf("invalid file type for test: %d", fileType))
+	}
 }
 
 func runAll(t *testing.T, test func(*testing.T, state)) {
 	runCustom(t, allTypes, allConfs, test)
 }
 
-func runCustom(t *testing.T, types []fileType, confs []Config, test func(*testing.T, state)) {
+func runCustom(t *testing.T, types []uint32, confs []Config, test func(*testing.T, state)) {
 	for _, c := range confs {
-		t.Logf("Config: %+v", c)
-
 		for _, ft := range types {
-			t.Logf("File type: %v", ft)
+			name := fmt.Sprintf("%s/%s", configTestName(&c), typeName(ft))
+			t.Run(name, func(t *testing.T) {
+				path, name, err := setup(ft)
+				if err != nil {
+					t.Fatalf("%v", err)
+				}
+				defer os.RemoveAll(path)
 
-			path, name, err := setup(ft)
-			if err != nil {
-				t.Fatalf("%v", err)
-			}
-			defer os.RemoveAll(path)
+				a, err := NewAttachPoint(path, c)
+				if err != nil {
+					t.Fatalf("NewAttachPoint failed: %v", err)
+				}
+				root, err := a.Attach()
+				if err != nil {
+					t.Fatalf("Attach failed, err: %v", err)
+				}
 
-			a, err := NewAttachPoint(path, c)
-			if err != nil {
-				t.Fatalf("NewAttachPoint failed: %v", err)
-			}
-			root, err := a.Attach()
-			if err != nil {
-				t.Fatalf("Attach failed, err: %v", err)
-			}
+				_, file, err := root.Walk([]string{name})
+				if err != nil {
+					root.Close()
+					t.Fatalf("root.Walk({%q}) failed, err: %v", name, err)
+				}
 
-			_, file, err := root.Walk([]string{name})
-			if err != nil {
+				st := state{
+					root:     root.(*localFile),
+					file:     file.(*localFile),
+					conf:     c,
+					fileType: ft,
+				}
+				test(t, st)
+				file.Close()
 				root.Close()
-				t.Fatalf("root.Walk({%q}) failed, err: %v", "symlink", err)
-			}
-
-			st := state{root: root.(*localFile), file: file.(*localFile), conf: c, ft: ft}
-			test(t, st)
-			file.Close()
-			root.Close()
+			})
 		}
 	}
 }
 
-func setup(ft fileType) (string, string, error) {
-	path, err := ioutil.TempDir("", "root-")
+func setup(fileType uint32) (string, string, error) {
+	path, err := ioutil.TempDir(testutil.TmpDir(), "root-")
 	if err != nil {
 		return "", "", fmt.Errorf("ioutil.TempDir() failed, err: %v", err)
 	}
@@ -169,26 +194,26 @@ func setup(ft fileType) (string, string, error) {
 	defer root.Close()
 
 	var name string
-	switch ft {
-	case regular:
+	switch fileType {
+	case unix.S_IFREG:
 		name = "file"
 		_, f, _, _, err := root.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
 		if err != nil {
 			return "", "", fmt.Errorf("createFile(root, %q) failed, err: %v", "test", err)
 		}
 		defer f.Close()
-	case directory:
+	case unix.S_IFDIR:
 		name = "dir"
 		if _, err := root.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
 			return "", "", fmt.Errorf("root.MkDir(%q) failed, err: %v", name, err)
 		}
-	case symlink:
+	case unix.S_IFLNK:
 		name = "symlink"
 		if _, err := root.Symlink("/some/target", name, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
 			return "", "", fmt.Errorf("root.Symlink(%q) failed, err: %v", name, err)
 		}
 	default:
-		panic(fmt.Sprintf("unknown file type %v", ft))
+		panic(fmt.Sprintf("unknown file type %v", fileType))
 	}
 	return path, name, nil
 }
@@ -202,7 +227,7 @@ func createFile(dir *localFile, name string) (*localFile, error) {
 }
 
 func TestReadWrite(t *testing.T) {
-	runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+	runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
 		child, err := createFile(s.file, "test")
 		if err != nil {
 			t.Fatalf("%v: createFile() failed, err: %v", s, err)
@@ -221,9 +246,13 @@ func TestReadWrite(t *testing.T) {
 			if err != nil {
 				t.Fatalf("%v: Walk(%s) failed, err: %v", s, "test", err)
 			}
-			if _, _, _, err := l.Open(flags); err != nil {
+			fd, _, _, err := l.Open(flags)
+			if err != nil {
 				t.Fatalf("%v: Open(%v) failed, err: %v", s, flags, err)
 			}
+			if fd != nil {
+				defer fd.Close()
+			}
 			if err := testReadWrite(l, flags, want); err != nil {
 				t.Fatalf("%v: testReadWrite(%v) failed: %v", s, flags, err)
 			}
@@ -232,14 +261,14 @@ func TestReadWrite(t *testing.T) {
 }
 
 func TestCreate(t *testing.T) {
-	runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+	runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
 		for i, flags := range allOpenFlags {
 			_, l, _, _, err := s.file.Create(fmt.Sprintf("test-%d", i), flags, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
 			if err != nil {
 				t.Fatalf("%v, %v: WriteAt() failed, err: %v", s, flags, err)
 			}
 
-			if err := testReadWrite(l, flags, []byte{}); err != nil {
+			if err := testReadWrite(l, flags, nil); err != nil {
 				t.Fatalf("%v: testReadWrite(%v) failed: %v", s, flags, err)
 			}
 		}
@@ -249,7 +278,7 @@ func TestCreate(t *testing.T) {
 // TestReadWriteDup tests that a file opened in any mode can be dup'ed and
 // reopened in any other mode.
 func TestReadWriteDup(t *testing.T) {
-	runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+	runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
 		child, err := createFile(s.file, "test")
 		if err != nil {
 			t.Fatalf("%v: createFile() failed, err: %v", s, err)
@@ -279,9 +308,13 @@ func TestReadWriteDup(t *testing.T) {
 					t.Fatalf("%v: Walk(<empty>) failed: %v", s, err)
 				}
 				defer dup.Close()
-				if _, _, _, err := dup.Open(dupFlags); err != nil {
+				fd, _, _, err := dup.Open(dupFlags)
+				if err != nil {
 					t.Fatalf("%v: Open(%v) failed: %v", s, flags, err)
 				}
+				if fd != nil {
+					defer fd.Close()
+				}
 				if err := testReadWrite(dup, dupFlags, want); err != nil {
 					t.Fatalf("%v: testReadWrite(%v) failed: %v", s, dupFlags, err)
 				}
@@ -291,19 +324,45 @@ func TestReadWriteDup(t *testing.T) {
 }
 
 func TestUnopened(t *testing.T) {
-	runCustom(t, []fileType{regular}, allConfs, func(t *testing.T, s state) {
+	runCustom(t, []uint32{unix.S_IFREG}, allConfs, func(t *testing.T, s state) {
 		b := []byte("foobar")
-		if _, err := s.file.WriteAt(b, 0); err != syscall.EBADF {
-			t.Errorf("%v: WriteAt() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		if _, err := s.file.WriteAt(b, 0); err != unix.EBADF {
+			t.Errorf("%v: WriteAt() should have failed, got: %v, expected: unix.EBADF", s, err)
 		}
-		if _, err := s.file.ReadAt(b, 0); err != syscall.EBADF {
-			t.Errorf("%v: ReadAt() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		if _, err := s.file.ReadAt(b, 0); err != unix.EBADF {
+			t.Errorf("%v: ReadAt() should have failed, got: %v, expected: unix.EBADF", s, err)
 		}
-		if _, err := s.file.Readdir(0, 100); err != syscall.EBADF {
-			t.Errorf("%v: Readdir() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		if _, err := s.file.Readdir(0, 100); err != unix.EBADF {
+			t.Errorf("%v: Readdir() should have failed, got: %v, expected: unix.EBADF", s, err)
 		}
-		if err := s.file.FSync(); err != syscall.EBADF {
-			t.Errorf("%v: FSync() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		if err := s.file.FSync(); err != unix.EBADF {
+			t.Errorf("%v: FSync() should have failed, got: %v, expected: unix.EBADF", s, err)
+		}
+	})
+}
+
+// TestOpenOPath is a regression test to ensure that a file that cannot be
+// opened for read is not allowed to be opened. This was happening because the
+// control file was opened with O_PATH, but Open() was not checking for it and
+// was allowing the control file to be reused.
+func TestOpenOPath(t *testing.T) {
+	runCustom(t, []uint32{unix.S_IFREG}, rwConfs, func(t *testing.T, s state) {
+		// First remove all permissions on the file.
+		if err := s.file.SetAttr(p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(0)}); err != nil {
+			t.Fatalf("SetAttr(): %v", err)
+		}
+		// Then walk to the file again to open a new control file.
+		filename := filepath.Base(s.file.hostPath)
+		_, newFile, err := s.root.Walk([]string{filename})
+		if err != nil {
+			t.Fatalf("root.Walk(%q): %v", filename, err)
+		}
+
+		if newFile.(*localFile).controlReadable {
+			t.Fatalf("control file didn't open with O_PATH: %+v", newFile)
+		}
+		if _, _, _, err := newFile.Open(p9.ReadOnly); err != unix.EACCES {
+			t.Fatalf("Open() should have failed, got: %v, wanted: EACCES", err)
 		}
 	})
 }
@@ -324,7 +383,7 @@ func TestSetAttrPerm(t *testing.T) {
 		valid := p9.SetAttrMask{Permissions: true}
 		attr := p9.SetAttr{Permissions: 0777}
 		got, err := SetGetAttr(s.file, valid, attr)
-		if s.ft == symlink {
+		if s.fileType == unix.S_IFLNK {
 			if err == nil {
 				t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions)
 			}
@@ -345,7 +404,7 @@ func TestSetAttrSize(t *testing.T) {
 			valid := p9.SetAttrMask{Size: true}
 			attr := p9.SetAttr{Size: size}
 			got, err := SetGetAttr(s.file, valid, attr)
-			if s.ft == symlink || s.ft == directory {
+			if s.fileType == unix.S_IFLNK || s.fileType == unix.S_IFDIR {
 				if err == nil {
 					t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions)
 				}
@@ -427,9 +486,9 @@ func TestLink(t *testing.T) {
 		}
 
 		err = dir.Link(s.file, linkFile)
-		if s.ft == directory {
-			if err != syscall.EPERM {
-				t.Errorf("%v: Link(target, %s) should have failed, got: %v, expected: syscall.EPERM", s, linkFile, err)
+		if s.fileType == unix.S_IFDIR {
+			if err != unix.EPERM {
+				t.Errorf("%v: Link(target, %s) should have failed, got: %v, expected: unix.EPERM", s, linkFile, err)
 			}
 			return
 		}
@@ -440,54 +499,64 @@ func TestLink(t *testing.T) {
 }
 
 func TestROMountChecks(t *testing.T) {
+	const want = unix.EROFS
+	uid := p9.UID(os.Getuid())
+	gid := p9.GID(os.Getgid())
+
 	runCustom(t, allTypes, roConfs, func(t *testing.T, s state) {
-		if _, _, _, _, err := s.file.Create("some_file", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF {
-			t.Errorf("%v: Create() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		if s.fileType != unix.S_IFLNK {
+			if _, _, _, err := s.file.Open(p9.WriteOnly); err != want {
+				t.Errorf("Open() should have failed, got: %v, expected: %v", err, want)
+			}
+			if _, _, _, err := s.file.Open(p9.ReadWrite); err != want {
+				t.Errorf("Open() should have failed, got: %v, expected: %v", err, want)
+			}
+			if _, _, _, err := s.file.Open(p9.ReadOnly | p9.OpenTruncate); err != want {
+				t.Errorf("Open() should have failed, got: %v, expected: %v", err, want)
+			}
+			f, _, _, err := s.file.Open(p9.ReadOnly)
+			if err != nil {
+				t.Errorf("Open() failed: %v", err)
+			}
+			if f != nil {
+				_ = f.Close()
+			}
 		}
-		if _, err := s.file.Mkdir("some_dir", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF {
-			t.Errorf("%v: MkDir() should have failed, got: %v, expected: syscall.EBADF", s, err)
+
+		if _, _, _, _, err := s.file.Create("some_file", p9.ReadWrite, 0777, uid, gid); err != want {
+			t.Errorf("Create() should have failed, got: %v, expected: %v", err, want)
 		}
-		if err := s.file.RenameAt("some_file", s.file, "other_file"); err != syscall.EBADF {
-			t.Errorf("%v: Rename() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		if _, err := s.file.Mkdir("some_dir", 0777, uid, gid); err != want {
+			t.Errorf("MkDir() should have failed, got: %v, expected: %v", err, want)
 		}
-		if _, err := s.file.Symlink("some_place", "some_symlink", p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF {
-			t.Errorf("%v: Symlink() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		if err := s.file.RenameAt("some_file", s.file, "other_file"); err != want {
+			t.Errorf("Rename() should have failed, got: %v, expected: %v", err, want)
 		}
-		if err := s.file.UnlinkAt("some_file", 0); err != syscall.EBADF {
-			t.Errorf("%v: UnlinkAt() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		if _, err := s.file.Symlink("some_place", "some_symlink", uid, gid); err != want {
+			t.Errorf("Symlink() should have failed, got: %v, expected: %v", err, want)
 		}
-		if err := s.file.Link(s.file, "some_link"); err != syscall.EBADF {
-			t.Errorf("%v: Link() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		if err := s.file.UnlinkAt("some_file", 0); err != want {
+			t.Errorf("UnlinkAt() should have failed, got: %v, expected: %v", err, want)
 		}
-
-		valid := p9.SetAttrMask{Size: true}
-		attr := p9.SetAttr{Size: 0}
-		if err := s.file.SetAttr(valid, attr); err != syscall.EBADF {
-			t.Errorf("%v: SetAttr() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		if err := s.file.Link(s.file, "some_link"); err != want {
+			t.Errorf("Link() should have failed, got: %v, expected: %v", err, want)
+		}
+		if _, err := s.file.Mknod("some-nod", 0777, 1, 2, uid, gid); err != want {
+			t.Errorf("Mknod() should have failed, got: %v, expected: %v", err, want)
 		}
-	})
-}
-
-func TestROMountPanics(t *testing.T) {
-	conf := Config{ROMount: true, PanicOnWrite: true}
-	runCustom(t, allTypes, []Config{conf}, func(t *testing.T, s state) {
-		assertPanic(t, func() { s.file.Create("some_file", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) })
-		assertPanic(t, func() { s.file.Mkdir("some_dir", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) })
-		assertPanic(t, func() { s.file.RenameAt("some_file", s.file, "other_file") })
-		assertPanic(t, func() { s.file.Symlink("some_place", "some_symlink", p9.UID(os.Getuid()), p9.GID(os.Getgid())) })
-		assertPanic(t, func() { s.file.UnlinkAt("some_file", 0) })
-		assertPanic(t, func() { s.file.Link(s.file, "some_link") })
 
 		valid := p9.SetAttrMask{Size: true}
 		attr := p9.SetAttr{Size: 0}
-		assertPanic(t, func() { s.file.SetAttr(valid, attr) })
+		if err := s.file.SetAttr(valid, attr); err != want {
+			t.Errorf("SetAttr() should have failed, got: %v, expected: %v", err, want)
+		}
 	})
 }
 
 func TestWalkNotFound(t *testing.T) {
-	runCustom(t, []fileType{directory}, allConfs, func(t *testing.T, s state) {
-		if _, _, err := s.file.Walk([]string{"nobody-here"}); err != syscall.ENOENT {
-			t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: syscall.ENOENT", s, "nobody-here", err)
+	runCustom(t, []uint32{unix.S_IFDIR}, allConfs, func(t *testing.T, s state) {
+		if _, _, err := s.file.Walk([]string{"nobody-here"}); err != unix.ENOENT {
+			t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: unix.ENOENT", s, "nobody-here", err)
 		}
 	})
 }
@@ -506,7 +575,7 @@ func TestWalkDup(t *testing.T) {
 }
 
 func TestReaddir(t *testing.T) {
-	runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+	runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
 		name := "dir"
 		if _, err := s.file.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
 			t.Fatalf("%v: MkDir(%s) failed, err: %v", s, name, err)
@@ -631,7 +700,7 @@ func TestAttachInvalidType(t *testing.T) {
 	defer os.RemoveAll(dir)
 
 	fifo := filepath.Join(dir, "fifo")
-	if err := syscall.Mkfifo(fifo, 0755); err != nil {
+	if err := unix.Mkfifo(fifo, 0755); err != nil {
 		t.Fatalf("Mkfifo(%q): %v", fifo, err)
 	}
 
@@ -690,3 +759,63 @@ func TestDoubleAttachError(t *testing.T) {
 		t.Fatalf("Attach should have failed, got %v want non-nil", err)
 	}
 }
+
+func TestTruncate(t *testing.T) {
+	runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+		child, err := createFile(s.file, "test")
+		if err != nil {
+			t.Fatalf("createFile() failed: %v", err)
+		}
+		defer child.Close()
+		want := []byte("foobar")
+		w, err := child.WriteAt(want, 0)
+		if err != nil {
+			t.Fatalf("Write() failed: %v", err)
+		}
+		if w != len(want) {
+			t.Fatalf("Write() was partial, got: %d, expected: %d", w, len(want))
+		}
+
+		_, l, err := s.file.Walk([]string{"test"})
+		if err != nil {
+			t.Fatalf("Walk(%s) failed: %v", "test", err)
+		}
+		if _, _, _, err := l.Open(p9.ReadOnly | p9.OpenTruncate); err != nil {
+			t.Fatalf("Open() failed: %v", err)
+		}
+		_, mask, attr, err := l.GetAttr(p9.AttrMask{Size: true})
+		if err != nil {
+			t.Fatalf("GetAttr() failed: %v", err)
+		}
+		if !mask.Size {
+			t.Fatalf("GetAttr() didn't return size: %+v", mask)
+		}
+		if attr.Size != 0 {
+			t.Fatalf("truncate didn't work, want: 0, got: %d", attr.Size)
+		}
+	})
+}
+
+func TestMknod(t *testing.T) {
+	runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+		_, err := s.file.Mknod("test", p9.ModeRegular|0777, 1, 2, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
+		if err != nil {
+			t.Fatalf("Mknod() failed: %v", err)
+		}
+
+		_, f, err := s.file.Walk([]string{"test"})
+		if err != nil {
+			t.Fatalf("Walk() failed: %v", err)
+		}
+		fd, _, _, err := f.Open(p9.ReadWrite)
+		if err != nil {
+			t.Fatalf("Open() failed: %v", err)
+		}
+		if fd != nil {
+			defer fd.Close()
+		}
+		if err := testReadWrite(f, p9.ReadWrite, nil); err != nil {
+			t.Fatalf("testReadWrite() failed: %v", err)
+		}
+	})
+}
diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go
index 542b54365..f11fea40d 100644
--- a/runsc/fsgofer/fsgofer_unsafe.go
+++ b/runsc/fsgofer/fsgofer_unsafe.go
@@ -15,18 +15,18 @@
 package fsgofer
 
 import (
-	"syscall"
 	"unsafe"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/syserr"
 )
 
-func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) error {
+func utimensat(dirFd int, name string, times [2]unix.Timespec, flags int) error {
 	// utimensat(2) doesn't accept empty name, instead name must be nil to make it
 	// operate directly on 'dirFd' unlike other *at syscalls.
 	var namePtr unsafe.Pointer
 	if name != "" {
-		nameBytes, err := syscall.BytePtrFromString(name)
+		nameBytes, err := unix.BytePtrFromString(name)
 		if err != nil {
 			return err
 		}
@@ -35,8 +35,8 @@ func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) err
 
 	timesPtr := unsafe.Pointer(&times[0])
 
-	if _, _, errno := syscall.Syscall6(
-		syscall.SYS_UTIMENSAT,
+	if _, _, errno := unix.Syscall6(
+		unix.SYS_UTIMENSAT,
 		uintptr(dirFd),
 		uintptr(namePtr),
 		uintptr(timesPtr),
@@ -52,7 +52,7 @@ func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) err
 func renameat(oldDirFD int, oldName string, newDirFD int, newName string) error {
 	var oldNamePtr unsafe.Pointer
 	if oldName != "" {
-		nameBytes, err := syscall.BytePtrFromString(oldName)
+		nameBytes, err := unix.BytePtrFromString(oldName)
 		if err != nil {
 			return err
 		}
@@ -60,15 +60,15 @@ func renameat(oldDirFD int, oldName string, newDirFD int, newName string) error
 	}
 	var newNamePtr unsafe.Pointer
 	if newName != "" {
-		nameBytes, err := syscall.BytePtrFromString(newName)
+		nameBytes, err := unix.BytePtrFromString(newName)
 		if err != nil {
 			return err
 		}
 		newNamePtr = unsafe.Pointer(nameBytes)
 	}
 
-	if _, _, errno := syscall.Syscall6(
-		syscall.SYS_RENAMEAT,
+	if _, _, errno := unix.Syscall6(
+		unix.SYS_RENAMEAT,
 		uintptr(oldDirFD),
 		uintptr(oldNamePtr),
 		uintptr(newDirFD),
diff --git a/runsc/main.go b/runsc/main.go
index 920ed84a5..4ce5ebee9 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,357 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Binary runsc is an implementation of the Open Container Initiative Runtime
-// that runs applications inside a sandbox.
+// Binary runsc implements the OCI runtime interface.
 package main
 
 import (
-	"context"
-	"fmt"
-	"io"
-	"io/ioutil"
-	"os"
-	"os/signal"
-	"path/filepath"
-	"strings"
-	"syscall"
-	"time"
-
-	"github.com/google/subcommands"
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/runsc/boot"
-	"gvisor.dev/gvisor/runsc/cmd"
-	"gvisor.dev/gvisor/runsc/flag"
-	"gvisor.dev/gvisor/runsc/specutils"
-)
-
-var (
-	// Although these flags are not part of the OCI spec, they are used by
-	// Docker, and thus should not be changed.
-	rootDir     = flag.String("root", "", "root directory for storage of container state.")
-	logFilename = flag.String("log", "", "file path where internal debug information is written, default is stdout.")
-	logFormat   = flag.String("log-format", "text", "log format: text (default), json, or json-k8s.")
-	debug       = flag.Bool("debug", false, "enable debug logging.")
-	showVersion = flag.Bool("version", false, "show version and exit.")
-	// TODO(gvisor.dev/issue/193): support systemd cgroups
-	systemdCgroup = flag.Bool("systemd-cgroup", false, "Use systemd for cgroups. NOT SUPPORTED.")
-
-	// These flags are unique to runsc, and are used to configure parts of the
-	// system that are not covered by the runtime spec.
-
-	// Debugging flags.
-	debugLog        = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
-	panicLog        = flag.String("panic-log", "", "file path were panic reports and other Go's runtime messages are written.")
-	logPackets      = flag.Bool("log-packets", false, "enable network packet logging.")
-	logFD           = flag.Int("log-fd", -1, "file descriptor to log to.  If set, the 'log' flag is ignored.")
-	debugLogFD      = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to.  If set, the 'debug-log-dir' flag is ignored.")
-	panicLogFD      = flag.Int("panic-log-fd", -1, "file descriptor to write Go's runtime messages.")
-	debugLogFormat  = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.")
-	alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr.")
-
-	// Debugging flags: strace related
-	strace         = flag.Bool("strace", false, "enable strace.")
-	straceSyscalls = flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.")
-	straceLogSize  = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs.")
-
-	// Flags that control sandbox runtime behavior.
-	platformName       = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.")
-	network            = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
-	hardwareGSO        = flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.")
-	softwareGSO        = flag.Bool("software-gso", true, "enable software segmentation offload when hardware ofload can't be enabled.")
-	qDisc              = flag.String("qdisc", "fifo", "specifies which queueing discipline to apply by default to the non loopback nics used by the sandbox.")
-	fileAccess         = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
-	fsGoferHostUDS     = flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.")
-	overlay            = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
-	overlayfsStaleRead = flag.Bool("overlayfs-stale-read", true, "assume root mount is an overlay filesystem")
-	watchdogAction     = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
-	panicSignal        = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
-	profile            = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
-	netRaw             = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
-	numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.")
-	rootless           = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
-	referenceLeakMode  = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.")
-	cpuNumFromQuota    = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)")
-	vfs2Enabled        = flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.")
-
-	// Test flags, not to be used outside tests, ever.
-	testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
-	testOnlyTestNameEnv                        = flag.String("TESTONLY-test-name-env", "", "TEST ONLY; do not ever use! Used for automated tests to improve logging.")
+	"gvisor.dev/gvisor/runsc/cli"
 )
 
 func main() {
-	// Help and flags commands are generated automatically.
-	help := cmd.NewHelp(subcommands.DefaultCommander)
-	help.Register(new(cmd.Syscalls))
-	subcommands.Register(help, "")
-	subcommands.Register(subcommands.FlagsCommand(), "")
-
-	// Installation helpers.
-	const helperGroup = "helpers"
-	subcommands.Register(new(cmd.Install), helperGroup)
-	subcommands.Register(new(cmd.Uninstall), helperGroup)
-
-	// Register user-facing runsc commands.
-	subcommands.Register(new(cmd.Checkpoint), "")
-	subcommands.Register(new(cmd.Create), "")
-	subcommands.Register(new(cmd.Delete), "")
-	subcommands.Register(new(cmd.Do), "")
-	subcommands.Register(new(cmd.Events), "")
-	subcommands.Register(new(cmd.Exec), "")
-	subcommands.Register(new(cmd.Gofer), "")
-	subcommands.Register(new(cmd.Kill), "")
-	subcommands.Register(new(cmd.List), "")
-	subcommands.Register(new(cmd.Pause), "")
-	subcommands.Register(new(cmd.PS), "")
-	subcommands.Register(new(cmd.Restore), "")
-	subcommands.Register(new(cmd.Resume), "")
-	subcommands.Register(new(cmd.Run), "")
-	subcommands.Register(new(cmd.Spec), "")
-	subcommands.Register(new(cmd.State), "")
-	subcommands.Register(new(cmd.Start), "")
-	subcommands.Register(new(cmd.Wait), "")
-
-	// Register internal commands with the internal group name. This causes
-	// them to be sorted below the user-facing commands with empty group.
-	// The string below will be printed above the commands.
-	const internalGroup = "internal use only"
-	subcommands.Register(new(cmd.Boot), internalGroup)
-	subcommands.Register(new(cmd.Debug), internalGroup)
-	subcommands.Register(new(cmd.Gofer), internalGroup)
-	subcommands.Register(new(cmd.Statefile), internalGroup)
-
-	// All subcommands must be registered before flag parsing.
-	flag.Parse()
-
-	// Are we showing the version?
-	if *showVersion {
-		// The format here is the same as runc.
-		fmt.Fprintf(os.Stdout, "runsc version %s\n", version)
-		fmt.Fprintf(os.Stdout, "spec: %s\n", specutils.Version)
-		os.Exit(0)
-	}
-
-	// TODO(gvisor.dev/issue/193): support systemd cgroups
-	if *systemdCgroup {
-		fmt.Fprintln(os.Stderr, "systemd cgroup flag passed, but systemd cgroups not supported. See gvisor.dev/issue/193")
-		os.Exit(1)
-	}
-
-	var errorLogger io.Writer
-	if *logFD > -1 {
-		errorLogger = os.NewFile(uintptr(*logFD), "error log file")
-
-	} else if *logFilename != "" {
-		// We must set O_APPEND and not O_TRUNC because Docker passes
-		// the same log file for all commands (and also parses these
-		// log files), so we can't destroy them on each command.
-		var err error
-		errorLogger, err = os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
-		if err != nil {
-			cmd.Fatalf("error opening log file %q: %v", *logFilename, err)
-		}
-	}
-	cmd.ErrorLogger = errorLogger
-
-	platformType := *platformName
-	if _, err := platform.Lookup(platformType); err != nil {
-		cmd.Fatalf("%v", err)
-	}
-
-	fsAccess, err := boot.MakeFileAccessType(*fileAccess)
-	if err != nil {
-		cmd.Fatalf("%v", err)
-	}
-
-	if fsAccess == boot.FileAccessShared && *overlay {
-		cmd.Fatalf("overlay flag is incompatible with shared file access")
-	}
-
-	netType, err := boot.MakeNetworkType(*network)
-	if err != nil {
-		cmd.Fatalf("%v", err)
-	}
-
-	wa, err := boot.MakeWatchdogAction(*watchdogAction)
-	if err != nil {
-		cmd.Fatalf("%v", err)
-	}
-
-	if *numNetworkChannels <= 0 {
-		cmd.Fatalf("num_network_channels must be > 0, got: %d", *numNetworkChannels)
-	}
-
-	refsLeakMode, err := boot.MakeRefsLeakMode(*referenceLeakMode)
-	if err != nil {
-		cmd.Fatalf("%v", err)
-	}
-
-	queueingDiscipline, err := boot.MakeQueueingDiscipline(*qDisc)
-	if err != nil {
-		cmd.Fatalf("%s", err)
-	}
-
-	// Sets the reference leak check mode. Also set it in config below to
-	// propagate it to child processes.
-	refs.SetLeakMode(refsLeakMode)
-
-	// Create a new Config from the flags.
-	conf := &boot.Config{
-		RootDir:            *rootDir,
-		Debug:              *debug,
-		LogFilename:        *logFilename,
-		LogFormat:          *logFormat,
-		DebugLog:           *debugLog,
-		PanicLog:           *panicLog,
-		DebugLogFormat:     *debugLogFormat,
-		FileAccess:         fsAccess,
-		FSGoferHostUDS:     *fsGoferHostUDS,
-		Overlay:            *overlay,
-		Network:            netType,
-		HardwareGSO:        *hardwareGSO,
-		SoftwareGSO:        *softwareGSO,
-		LogPackets:         *logPackets,
-		Platform:           platformType,
-		Strace:             *strace,
-		StraceLogSize:      *straceLogSize,
-		WatchdogAction:     wa,
-		PanicSignal:        *panicSignal,
-		ProfileEnable:      *profile,
-		EnableRaw:          *netRaw,
-		NumNetworkChannels: *numNetworkChannels,
-		Rootless:           *rootless,
-		AlsoLogToStderr:    *alsoLogToStderr,
-		ReferenceLeakMode:  refsLeakMode,
-		OverlayfsStaleRead: *overlayfsStaleRead,
-		CPUNumFromQuota:    *cpuNumFromQuota,
-		VFS2:               *vfs2Enabled,
-		QDisc:              queueingDiscipline,
-		TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
-		TestOnlyTestNameEnv:                        *testOnlyTestNameEnv,
-	}
-	if len(*straceSyscalls) != 0 {
-		conf.StraceSyscalls = strings.Split(*straceSyscalls, ",")
-	}
-
-	// Set up logging.
-	if *debug {
-		log.SetLevel(log.Debug)
-	}
-
-	// Logging will include the local date and time via the time package.
-	//
-	// On first use, time.Local initializes the local time zone, which
-	// involves opening tzdata files on the host. Since this requires
-	// opening host files, it must be done before syscall filter
-	// installation.
-	//
-	// Generally there will be a log message before filter installation
-	// that will force initialization, but force initialization here in
-	// case that does not occur.
-	_ = time.Local.String()
-
-	subcommand := flag.CommandLine.Arg(0)
-
-	var e log.Emitter
-	if *debugLogFD > -1 {
-		f := os.NewFile(uintptr(*debugLogFD), "debug log file")
-
-		e = newEmitter(*debugLogFormat, f)
-
-	} else if *debugLog != "" {
-		f, err := specutils.DebugLogFile(*debugLog, subcommand, "" /* name */)
-		if err != nil {
-			cmd.Fatalf("error opening debug log file in %q: %v", *debugLog, err)
-		}
-		e = newEmitter(*debugLogFormat, f)
-
-	} else {
-		// Stderr is reserved for the application, just discard the logs if no debug
-		// log is specified.
-		e = newEmitter("text", ioutil.Discard)
-	}
-
-	if *panicLogFD > -1 || *debugLogFD > -1 {
-		fd := *panicLogFD
-		if fd < 0 {
-			fd = *debugLogFD
-		}
-		// Quick sanity check to make sure no other commands get passed
-		// a log fd (they should use log dir instead).
-		if subcommand != "boot" && subcommand != "gofer" {
-			cmd.Fatalf("flags --debug-log-fd and --panic-log-fd should only be passed to 'boot' and 'gofer' command, but was passed to %q", subcommand)
-		}
-
-		// If we are the boot process, then we own our stdio FDs and can do what we
-		// want with them. Since Docker and Containerd both eat boot's stderr, we
-		// dup our stderr to the provided log FD so that panics will appear in the
-		// logs, rather than just disappear.
-		if err := syscall.Dup3(fd, int(os.Stderr.Fd()), 0); err != nil {
-			cmd.Fatalf("error dup'ing fd %d to stderr: %v", fd, err)
-		}
-	} else if *alsoLogToStderr {
-		e = &log.MultiEmitter{e, newEmitter(*debugLogFormat, os.Stderr)}
-	}
-
-	log.SetTarget(e)
-
-	log.Infof("***************************")
-	log.Infof("Args: %s", os.Args)
-	log.Infof("Version %s", version)
-	log.Infof("PID: %d", os.Getpid())
-	log.Infof("UID: %d, GID: %d", os.Getuid(), os.Getgid())
-	log.Infof("Configuration:")
-	log.Infof("\t\tRootDir: %s", conf.RootDir)
-	log.Infof("\t\tPlatform: %v", conf.Platform)
-	log.Infof("\t\tFileAccess: %v, overlay: %t", conf.FileAccess, conf.Overlay)
-	log.Infof("\t\tNetwork: %v, logging: %t", conf.Network, conf.LogPackets)
-	log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls)
-	log.Infof("\t\tVFS2 enabled: %v", conf.VFS2)
-	log.Infof("***************************")
-
-	if *testOnlyAllowRunAsCurrentUserWithoutChroot {
-		// SIGTERM is sent to all processes if a test exceeds its
-		// timeout and this case is handled by syscall_test_runner.
-		log.Warningf("Block the TERM signal. This is only safe in tests!")
-		signal.Ignore(syscall.SIGTERM)
-	}
-
-	// Call the subcommand and pass in the configuration.
-	var ws syscall.WaitStatus
-	subcmdCode := subcommands.Execute(context.Background(), conf, &ws)
-	if subcmdCode == subcommands.ExitSuccess {
-		log.Infof("Exiting with status: %v", ws)
-		if ws.Signaled() {
-			// No good way to return it, emulate what the shell does. Maybe raise
-			// signal to self?
-			os.Exit(128 + int(ws.Signal()))
-		}
-		os.Exit(ws.ExitStatus())
-	}
-	// Return an error that is unlikely to be used by the application.
-	log.Warningf("Failure to execute command, err: %v", subcmdCode)
-	os.Exit(128)
-}
-
-func newEmitter(format string, logFile io.Writer) log.Emitter {
-	switch format {
-	case "text":
-		return log.GoogleEmitter{&log.Writer{Next: logFile}}
-	case "json":
-		return log.JSONEmitter{&log.Writer{Next: logFile}}
-	case "json-k8s":
-		return log.K8sJSONEmitter{&log.Writer{Next: logFile}}
-	}
-	cmd.Fatalf("invalid log format %q, must be 'text', 'json', or 'json-k8s'", format)
-	panic("unreachable")
-}
-
-func init() {
-	// Set default root dir to something (hopefully) user-writeable.
-	*rootDir = "/var/run/runsc"
-	if runtimeDir := os.Getenv("XDG_RUNTIME_DIR"); runtimeDir != "" {
-		*rootDir = filepath.Join(runtimeDir, "runsc")
-	}
+	cli.Main(version)
 }
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
index 035dcd3e3..f0a551a1e 100644
--- a/runsc/sandbox/BUILD
+++ b/runsc/sandbox/BUILD
@@ -26,10 +26,11 @@ go_library(
         "//runsc/boot",
         "//runsc/boot/platforms",
         "//runsc/cgroup",
+        "//runsc/config",
         "//runsc/console",
         "//runsc/specutils",
         "@com_github_cenkalti_backoff//:go_default_library",
-        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
         "@com_github_syndtr_gocapability//capability:go_default_library",
         "@com_github_vishvananda_netlink//:go_default_library",
         "@org_golang_x_sys//unix:go_default_library",
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 209bfdb20..8f66dd1f8 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -31,6 +31,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -49,26 +50,26 @@ import (
 //
 // Run the following container to test it:
 //  docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4
-func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Config) error {
+func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *config.Config) error {
 	log.Infof("Setting up network")
 
 	switch conf.Network {
-	case boot.NetworkNone:
+	case config.NetworkNone:
 		log.Infof("Network is disabled, create loopback interface only")
 		if err := createDefaultLoopbackInterface(conn); err != nil {
 			return fmt.Errorf("creating default loopback interface: %v", err)
 		}
-	case boot.NetworkSandbox:
+	case config.NetworkSandbox:
 		// Build the path to the net namespace of the sandbox process.
 		// This is what we will copy.
 		nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
-		if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.NumNetworkChannels, conf.QDisc); err != nil {
+		if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.TXChecksumOffload, conf.RXChecksumOffload, conf.NumNetworkChannels, conf.QDisc); err != nil {
 			return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err)
 		}
-	case boot.NetworkHost:
+	case config.NetworkHost:
 		// Nothing to do here.
 	default:
-		return fmt.Errorf("invalid network type: %d", conf.Network)
+		return fmt.Errorf("invalid network type: %v", conf.Network)
 	}
 	return nil
 }
@@ -115,7 +116,7 @@ func isRootNS() (bool, error) {
 // createInterfacesAndRoutesFromNS scrapes the interface and routes from the
 // net namespace with the given path, creates them in the sandbox, and removes
 // them from the host.
-func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, numNetworkChannels int, qDisc boot.QueueingDiscipline) error {
+func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, txChecksumOffload bool, rxChecksumOffload bool, numNetworkChannels int, qDisc config.QueueingDiscipline) error {
 	// Join the network namespace that we will be copying.
 	restore, err := joinNetNS(nsPath)
 	if err != nil {
@@ -134,7 +135,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 		return err
 	}
 	if isRoot {
-
 		return fmt.Errorf("cannot run with network enabled in root network namespace")
 	}
 
@@ -197,11 +197,13 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
 		}
 
 		link := boot.FDBasedLink{
-			Name:        iface.Name,
-			MTU:         iface.MTU,
-			Routes:      routes,
-			NumChannels: numNetworkChannels,
-			QDisc:       qDisc,
+			Name:              iface.Name,
+			MTU:               iface.MTU,
+			Routes:            routes,
+			TXChecksumOffload: txChecksumOffload,
+			RXChecksumOffload: rxChecksumOffload,
+			NumChannels:       numNetworkChannels,
+			QDisc:             qDisc,
 		}
 
 		// Get the link for the interface.
@@ -307,11 +309,20 @@ func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (
 	const bufSize = 4 << 20 // 4MB.
 
 	if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, bufSize); err != nil {
-		return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", bufSize, err)
+		syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF, bufSize)
+		sz, _ := syscall.GetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF)
+
+		if sz < bufSize {
+			log.Warningf("Failed to increase rcv buffer to %d on SOCK_RAW on %s. Current buffer %d: %v", bufSize, iface.Name, sz, err)
+		}
 	}
 
 	if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUFFORCE, bufSize); err != nil {
-		return nil, fmt.Errorf("failed to increase socket snd buffer to %d: %v", bufSize, err)
+		syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF, bufSize)
+		sz, _ := syscall.GetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF)
+		if sz < bufSize {
+			log.Warningf("Failed to increase snd buffer to %d on SOCK_RAW on %s. Curent buffer %d: %v", bufSize, iface.Name, sz, err)
+		}
 	}
 
 	return &socketEntry{deviceFile, gsoMaxSize}, nil
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 6e1a2af25..c4309feb3 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -41,6 +41,7 @@ import (
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/boot/platforms"
 	"gvisor.dev/gvisor/runsc/cgroup"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/console"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
@@ -71,11 +72,14 @@ type Sandbox struct {
 	// will have it as a child process.
 	child bool
 
-	// status is an exit status of a sandbox process.
-	status syscall.WaitStatus
-
 	// statusMu protects status.
 	statusMu sync.Mutex
+
+	// status is the exit status of a sandbox process. It's only set if the
+	// child==true and the sandbox was waited on. This field allows for multiple
+	// threads to wait on sandbox and get the exit code, since Linux will return
+	// WaitStatus to one of the waiters only.
+	status syscall.WaitStatus
 }
 
 // Args is used to configure a new sandbox.
@@ -116,7 +120,7 @@ type Args struct {
 
 // New creates the sandbox process. The caller must call Destroy() on the
 // sandbox.
-func New(conf *boot.Config, args *Args) (*Sandbox, error) {
+func New(conf *config.Config, args *Args) (*Sandbox, error) {
 	s := &Sandbox{ID: args.ID, Cgroup: args.Cgroup}
 	// The Cleanup object cleans up partially created sandboxes when an error
 	// occurs. Any errors occurring during cleanup itself are ignored.
@@ -180,7 +184,7 @@ func (s *Sandbox) CreateContainer(cid string) error {
 }
 
 // StartRoot starts running the root container process inside the sandbox.
-func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error {
+func (s *Sandbox) StartRoot(spec *specs.Spec, conf *config.Config) error {
 	log.Debugf("Start root sandbox %q, PID: %d", s.ID, s.Pid)
 	conn, err := s.sandboxConnect()
 	if err != nil {
@@ -203,7 +207,7 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error {
 }
 
 // StartContainer starts running a non-root container inside the sandbox.
-func (s *Sandbox) StartContainer(spec *specs.Spec, conf *boot.Config, cid string, goferFiles []*os.File) error {
+func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid string, goferFiles []*os.File) error {
 	for _, f := range goferFiles {
 		defer f.Close()
 	}
@@ -232,7 +236,7 @@ func (s *Sandbox) StartContainer(spec *specs.Spec, conf *boot.Config, cid string
 }
 
 // Restore sends the restore call for a container in the sandbox.
-func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, filename string) error {
+func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *config.Config, filename string) error {
 	log.Debugf("Restore sandbox %q", s.ID)
 
 	rf, err := os.Open(filename)
@@ -344,7 +348,7 @@ func (s *Sandbox) connError(err error) error {
 
 // createSandboxProcess starts the sandbox as a subprocess by running the "boot"
 // command, passing in the bundle dir.
-func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncFile *os.File) error {
+func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyncFile *os.File) error {
 	// nextFD is used to get unused FDs that we can pass to the sandbox.  It
 	// starts at 3 because 0, 1, and 2 are taken by stdin/out/err.
 	nextFD := 3
@@ -477,12 +481,10 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 	cmd.Stderr = nil
 
 	// If the console control socket file is provided, then create a new
-	// pty master/slave pair and set the TTY on the sandbox process.
-	if args.ConsoleSocket != "" {
-		cmd.Args = append(cmd.Args, "--console=true")
-
+	// pty master/replica pair and set the TTY on the sandbox process.
+	if args.Spec.Process.Terminal && args.ConsoleSocket != "" {
 		// console.NewWithSocket will send the master on the given
-		// socket, and return the slave.
+		// socket, and return the replica.
 		tty, err := console.NewWithSocket(args.ConsoleSocket)
 		if err != nil {
 			return fmt.Errorf("setting up console with socket %q: %v", args.ConsoleSocket, err)
@@ -557,10 +559,10 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 	// Joins the network namespace if network is enabled. the sandbox talks
 	// directly to the host network, which may have been configured in the
 	// namespace.
-	if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != boot.NetworkNone {
+	if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != config.NetworkNone {
 		log.Infof("Sandbox will be started in the container's network namespace: %+v", ns)
 		nss = append(nss, ns)
-	} else if conf.Network == boot.NetworkHost {
+	} else if conf.Network == config.NetworkHost {
 		log.Infof("Sandbox will be started in the host network namespace")
 	} else {
 		log.Infof("Sandbox will be started in new network namespace")
@@ -570,7 +572,7 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 	// User namespace depends on the network type. Host network requires to run
 	// inside the user namespace specified in the spec or the current namespace
 	// if none is configured.
-	if conf.Network == boot.NetworkHost {
+	if conf.Network == config.NetworkHost {
 		if userns, ok := specutils.GetNS(specs.UserNamespace, args.Spec); ok {
 			log.Infof("Sandbox will be started in container's user namespace: %+v", userns)
 			nss = append(nss, userns)
@@ -747,35 +749,47 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 // Wait waits for the containerized process to exit, and returns its WaitStatus.
 func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) {
 	log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID)
-	var ws syscall.WaitStatus
 
 	if conn, err := s.sandboxConnect(); err != nil {
-		// The sandbox may have exited while before we had a chance to
-		// wait on it.
+		// The sandbox may have exited before we had a chance to wait on it.
+		// There is nothing we can do for subcontainers. For the init container, we
+		// can try to get the sandbox exit code.
+		if !s.IsRootContainer(cid) {
+			return syscall.WaitStatus(0), err
+		}
 		log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
 	} else {
 		defer conn.Close()
+
 		// Try the Wait RPC to the sandbox.
+		var ws syscall.WaitStatus
 		err = conn.Call(boot.ContainerWait, &cid, &ws)
 		if err == nil {
 			// It worked!
 			return ws, nil
 		}
+		// See comment above.
+		if !s.IsRootContainer(cid) {
+			return syscall.WaitStatus(0), err
+		}
+
 		// The sandbox may have exited after we connected, but before
 		// or during the Wait RPC.
 		log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
 	}
 
-	// The sandbox may have already exited, or exited while handling the
-	// Wait RPC. The best we can do is ask Linux what the sandbox exit
-	// status was, since in most cases that will be the same as the
-	// container exit status.
+	// The sandbox may have already exited, or exited while handling the Wait RPC.
+	// The best we can do is ask Linux what the sandbox exit status was, since in
+	// most cases that will be the same as the container exit status.
 	if err := s.waitForStopped(); err != nil {
-		return ws, err
+		return syscall.WaitStatus(0), err
 	}
 	if !s.child {
-		return ws, fmt.Errorf("sandbox no longer running and its exit status is unavailable")
+		return syscall.WaitStatus(0), fmt.Errorf("sandbox no longer running and its exit status is unavailable")
 	}
+
+	s.statusMu.Lock()
+	defer s.statusMu.Unlock()
 	return s.status, nil
 }
 
@@ -1014,26 +1028,6 @@ func (s *Sandbox) StopCPUProfile() error {
 	return nil
 }
 
-// GoroutineProfile writes a goroutine profile to the given file.
-func (s *Sandbox) GoroutineProfile(f *os.File) error {
-	log.Debugf("Goroutine profile %q", s.ID)
-	conn, err := s.sandboxConnect()
-	if err != nil {
-		return err
-	}
-	defer conn.Close()
-
-	opts := control.ProfileOpts{
-		FilePayload: urpc.FilePayload{
-			Files: []*os.File{f},
-		},
-	}
-	if err := conn.Call(boot.GoroutineProfile, &opts, nil); err != nil {
-		return fmt.Errorf("getting sandbox %q goroutine profile: %v", s.ID, err)
-	}
-	return nil
-}
-
 // BlockProfile writes a block profile to the given file.
 func (s *Sandbox) BlockProfile(f *os.File) error {
 	log.Debugf("Block profile %q", s.ID)
@@ -1201,7 +1195,7 @@ func deviceFileForPlatform(name string) (*os.File, error) {
 
 // checkBinaryPermissions verifies that the required binary bits are set on
 // the runsc executable.
-func checkBinaryPermissions(conf *boot.Config) error {
+func checkBinaryPermissions(conf *config.Config) error {
 	// All platforms need the other exe bit
 	neededBits := os.FileMode(0001)
 	if conf.Platform == platforms.Ptrace {
diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD
index 4ccd77f63..679d8bc8e 100644
--- a/runsc/specutils/BUILD
+++ b/runsc/specutils/BUILD
@@ -16,8 +16,10 @@ go_library(
         "//pkg/bits",
         "//pkg/log",
         "//pkg/sentry/kernel/auth",
+        "//runsc/config",
         "@com_github_cenkalti_backoff//:go_default_library",
-        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@com_github_mohae_deepcopy//:go_default_library",
+        "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
         "@com_github_syndtr_gocapability//capability:go_default_library",
         "@org_golang_x_sys//unix:go_default_library",
     ],
@@ -28,5 +30,5 @@ go_test(
     size = "small",
     srcs = ["specutils_test.go"],
     library = ":specutils",
-    deps = ["@com_github_opencontainers_runtime-spec//specs-go:go_default_library"],
+    deps = ["@com_github_opencontainers_runtime_spec//specs-go:go_default_library"],
 )
diff --git a/runsc/specutils/seccomp/BUILD b/runsc/specutils/seccomp/BUILD
new file mode 100644
index 000000000..3520f2d6d
--- /dev/null
+++ b/runsc/specutils/seccomp/BUILD
@@ -0,0 +1,34 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "seccomp",
+    srcs = [
+        "audit_amd64.go",
+        "audit_arm64.go",
+        "seccomp.go",
+    ],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/bpf",
+        "//pkg/log",
+        "//pkg/seccomp",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/syscalls/linux",
+        "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
+    ],
+)
+
+go_test(
+    name = "seccomp_test",
+    size = "small",
+    srcs = ["seccomp_test.go"],
+    library = ":seccomp",
+    deps = [
+        "//pkg/binary",
+        "//pkg/bpf",
+        "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
+    ],
+)
diff --git a/runsc/specutils/seccomp/audit_amd64.go b/runsc/specutils/seccomp/audit_amd64.go
new file mode 100644
index 000000000..417cf4a7a
--- /dev/null
+++ b/runsc/specutils/seccomp/audit_amd64.go
@@ -0,0 +1,25 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package seccomp
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+)
+
+const (
+	nativeArchAuditNo = linux.AUDIT_ARCH_X86_64
+)
diff --git a/runsc/specutils/seccomp/audit_arm64.go b/runsc/specutils/seccomp/audit_arm64.go
new file mode 100644
index 000000000..b727ceff2
--- /dev/null
+++ b/runsc/specutils/seccomp/audit_arm64.go
@@ -0,0 +1,25 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package seccomp
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+)
+
+const (
+	nativeArchAuditNo = linux.AUDIT_ARCH_AARCH64
+)
diff --git a/runsc/specutils/seccomp/seccomp.go b/runsc/specutils/seccomp/seccomp.go
new file mode 100644
index 000000000..5932f7a41
--- /dev/null
+++ b/runsc/specutils/seccomp/seccomp.go
@@ -0,0 +1,229 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package seccomp implements some features of libseccomp in order to support
+// OCI.
+package seccomp
+
+import (
+	"fmt"
+	"syscall"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/bpf"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/seccomp"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+)
+
+var (
+	killThreadAction = linux.SECCOMP_RET_KILL_THREAD
+	trapAction       = linux.SECCOMP_RET_TRAP
+	// runc always returns EPERM as the errorcode for SECCOMP_RET_ERRNO
+	errnoAction = linux.SECCOMP_RET_ERRNO.WithReturnCode(uint16(syscall.EPERM))
+	// runc always returns EPERM as the errorcode for SECCOMP_RET_TRACE
+	traceAction = linux.SECCOMP_RET_TRACE.WithReturnCode(uint16(syscall.EPERM))
+	allowAction = linux.SECCOMP_RET_ALLOW
+)
+
+// BuildProgram compiles the given OCI seccomp config into a bpf program. Any
+// conversion, build, or compile failure is returned wrapped with context.
+func BuildProgram(s *specs.LinuxSeccomp) (bpf.Program, error) {
+	defaultAction, err := convertAction(s.DefaultAction)
+	if err != nil {
+		return bpf.Program{}, fmt.Errorf("seccomp default action: %w", err)
+	}
+	ruleset, err := convertRules(s)
+	if err != nil {
+		return bpf.Program{}, fmt.Errorf("invalid seccomp rules: %w", err)
+	}
+
+	instrs, err := seccomp.BuildProgram(ruleset, defaultAction, killThreadAction)
+	if err != nil {
+		return bpf.Program{}, fmt.Errorf("building seccomp program: %w", err)
+	}
+
+	program, err := bpf.Compile(instrs)
+	if err != nil {
+		return bpf.Program{}, fmt.Errorf("compiling seccomp program: %w", err)
+	}
+
+	return program, nil
+}
+
+// lookupSyscallNo gets the syscall number for the syscall with the given name
+// for the given architecture.
+func lookupSyscallNo(arch uint32, name string) (uint32, error) {
+	var table *kernel.SyscallTable
+	switch arch {
+	case linux.AUDIT_ARCH_X86_64:
+		table = slinux.AMD64
+	case linux.AUDIT_ARCH_AARCH64:
+		table = slinux.ARM64
+	}
+	if table == nil {
+		return 0, fmt.Errorf("unsupported architecture: %d", arch)
+	}
+	n, err := table.LookupNo(name)
+	if err != nil {
+		return 0, err
+	}
+	return uint32(n), nil
+}
+
+// convertAction converts a LinuxSeccompAction to BPFAction
+func convertAction(act specs.LinuxSeccompAction) (linux.BPFAction, error) {
+	// TODO(gvisor.dev/issue/3124): Update specs package to include ActLog and ActKillProcess.
+	switch act {
+	case specs.ActKill:
+		return killThreadAction, nil
+	case specs.ActTrap:
+		return trapAction, nil
+	case specs.ActErrno:
+		return errnoAction, nil
+	case specs.ActTrace:
+		return traceAction, nil
+	case specs.ActAllow:
+		return allowAction, nil
+	default:
+		return 0, fmt.Errorf("invalid action: %v", act)
+	}
+}
+
+// convertRules converts OCI linux seccomp rules into RuleSets that can be used by
+// the seccomp package to build a seccomp program.
+func convertRules(s *specs.LinuxSeccomp) ([]seccomp.RuleSet, error) {
+	// NOTE: Architectures are only really relevant when calling 32bit syscalls
+	// on a 64bit system. Since we don't support that in gVisor anyway, we
+	// ignore Architectures and only test against the native architecture.
+
+	ruleset := []seccomp.RuleSet{}
+
+	for _, syscall := range s.Syscalls {
+		sysRules := seccomp.NewSyscallRules()
+
+		action, err := convertAction(syscall.Action)
+		if err != nil {
+			return nil, err
+		}
+
+		// Args
+		rules, err := convertArgs(syscall.Args)
+		if err != nil {
+			return nil, err
+		}
+
+		for _, name := range syscall.Names {
+			syscallNo, err := lookupSyscallNo(nativeArchAuditNo, name)
+			if err != nil {
+				// If there is an error looking up the syscall number, assume it is
+				// not supported on this architecture and ignore it. This is, for
+				// better or worse, what runc does.
+				log.Warningf("OCI seccomp: ignoring syscall %q", name)
+				continue
+			}
+
+			for _, rule := range rules {
+				sysRules.AddRule(uintptr(syscallNo), rule)
+			}
+		}
+
+		ruleset = append(ruleset, seccomp.RuleSet{
+			Rules:  sysRules,
+			Action: action,
+		})
+	}
+
+	return ruleset, nil
+}
+
+// convertArgs converts OCI seccomp argument rules to a list of seccomp.Rule.
+func convertArgs(args []specs.LinuxSeccompArg) ([]seccomp.Rule, error) {
+	argCounts := make([]uint, 6)
+	// Only indices 0-5 fit in argCounts; reject anything else before indexing.
+	for _, arg := range args {
+		if arg.Index >= 6 {
+			return nil, fmt.Errorf("invalid index: %d", arg.Index)
+		}
+
+		argCounts[arg.Index]++
+	}
+
+	// NOTE: If multiple rules apply to the same argument (same index) the
+	// action is triggered if any one of the rules matches (OR). If not, then
+	// all rules must match in order to trigger the action (AND). This appears to
+	// be some kind of legacy behavior of runc that nevertheless needs to be
+	// supported to maintain compatibility.
+
+	hasMultipleArgs := false
+	for _, count := range argCounts {
+		if count > 1 {
+			hasMultipleArgs = true
+			break
+		}
+	}
+
+	if hasMultipleArgs {
+		rules := []seccomp.Rule{}
+
+		// Old runc behavior - do this for compatibility.
+		// Add rules as ORs by adding separate Rules.
+		for _, arg := range args {
+			rule := seccomp.Rule{nil, nil, nil, nil, nil, nil}
+
+			if err := convertRule(arg, &rule); err != nil {
+				return nil, err
+			}
+
+			rules = append(rules, rule)
+		}
+
+		return rules, nil
+	}
+
+	// Add rules as ANDs by adding to the same Rule.
+	rule := seccomp.Rule{nil, nil, nil, nil, nil, nil}
+	for _, arg := range args {
+		if err := convertRule(arg, &rule); err != nil {
+			return nil, err
+		}
+	}
+
+	return []seccomp.Rule{rule}, nil
+}
+
+// convertRule converts and adds the arg to a rule.
+func convertRule(arg specs.LinuxSeccompArg, rule *seccomp.Rule) error {
+	switch arg.Op {
+	case specs.OpEqualTo:
+		rule[arg.Index] = seccomp.EqualTo(arg.Value)
+	case specs.OpNotEqual:
+		rule[arg.Index] = seccomp.NotEqual(arg.Value)
+	case specs.OpGreaterThan:
+		rule[arg.Index] = seccomp.GreaterThan(arg.Value)
+	case specs.OpGreaterEqual:
+		rule[arg.Index] = seccomp.GreaterThanOrEqual(arg.Value)
+	case specs.OpLessThan:
+		rule[arg.Index] = seccomp.LessThan(arg.Value)
+	case specs.OpLessEqual:
+		rule[arg.Index] = seccomp.LessThanOrEqual(arg.Value)
+	case specs.OpMaskedEqual:
+		rule[arg.Index] = seccomp.MaskedEqual(uintptr(arg.Value), uintptr(arg.ValueTwo))
+	default:
+		return fmt.Errorf("unsupported operand: %q", arg.Op)
+	}
+	return nil
+}
diff --git a/runsc/specutils/seccomp/seccomp_test.go b/runsc/specutils/seccomp/seccomp_test.go
new file mode 100644
index 000000000..850c237ba
--- /dev/null
+++ b/runsc/specutils/seccomp/seccomp_test.go
@@ -0,0 +1,414 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package seccomp
+
+import (
+	"fmt"
+	"syscall"
+	"testing"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/bpf"
+)
+
+type seccompData struct {
+	nr                 uint32
+	arch               uint32
+	instructionPointer uint64
+	args               [6]uint64
+}
+
+// asInput converts a seccompData to a bpf.Input.
+func asInput(d seccompData) bpf.Input {
+	return bpf.InputBytes{binary.Marshal(nil, binary.LittleEndian, d), binary.LittleEndian}
+}
+
+// testInput creates an Input struct with given seccomp input values.
+func testInput(arch uint32, syscallName string, args *[6]uint64) bpf.Input {
+	syscallNo, err := lookupSyscallNo(arch, syscallName)
+	if err != nil {
+		// Assume tests set valid syscall names.
+		panic(err)
+	}
+
+	if args == nil {
+		argArray := [6]uint64{0, 0, 0, 0, 0, 0}
+		args = &argArray
+	}
+
+	data := seccompData{
+		nr:   syscallNo,
+		arch: arch,
+		args: *args,
+	}
+
+	return asInput(data)
+}
+
+// testCase holds a seccomp test case.
+type testCase struct {
+	name     string
+	config   specs.LinuxSeccomp
+	input    bpf.Input
+	expected uint32
+}
+
+var (
+	// seccompTests is a list of seccomp test cases.
+	seccompTests = []testCase{
+		{
+			name: "default_allow",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+			},
+			input:    testInput(nativeArchAuditNo, "read", nil),
+			expected: uint32(allowAction),
+		},
+		{
+			name: "default_deny",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActErrno,
+			},
+			input:    testInput(nativeArchAuditNo, "read", nil),
+			expected: uint32(errnoAction),
+		},
+		{
+			name: "deny_arch",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"getcwd",
+						},
+						Action: specs.ActErrno,
+					},
+				},
+			},
+			// Syscall matches but the arch is AUDIT_ARCH_X86 so the return
+			// value is the bad arch action.
+			input:    asInput(seccompData{nr: 183, arch: 0x40000003}), //
+			expected: uint32(killThreadAction),
+		},
+		{
+			name: "match_name_errno",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"getcwd",
+							"chmod",
+						},
+						Action: specs.ActErrno,
+					},
+					{
+						Names: []string{
+							"write",
+						},
+						Action: specs.ActTrace,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "getcwd", nil),
+			expected: uint32(errnoAction),
+		},
+		{
+			name: "match_name_trace",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"getcwd",
+							"chmod",
+						},
+						Action: specs.ActErrno,
+					},
+					{
+						Names: []string{
+							"write",
+						},
+						Action: specs.ActTrace,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "write", nil),
+			expected: uint32(traceAction),
+		},
+		{
+			name: "no_match_name_allow",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"getcwd",
+							"chmod",
+						},
+						Action: specs.ActErrno,
+					},
+					{
+						Names: []string{
+							"write",
+						},
+						Action: specs.ActTrace,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "openat", nil),
+			expected: uint32(allowAction),
+		},
+		{
+			name: "simple_match_args",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"clone",
+						},
+						Args: []specs.LinuxSeccompArg{
+							{
+								Index: 0,
+								Value: syscall.CLONE_FS,
+								Op:    specs.OpEqualTo,
+							},
+						},
+						Action: specs.ActErrno,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_FS}),
+			expected: uint32(errnoAction),
+		},
+		{
+			name: "match_args_or",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"clone",
+						},
+						Args: []specs.LinuxSeccompArg{
+							{
+								Index: 0,
+								Value: syscall.CLONE_FS,
+								Op:    specs.OpEqualTo,
+							},
+							{
+								Index: 0,
+								Value: syscall.CLONE_VM,
+								Op:    specs.OpEqualTo,
+							},
+						},
+						Action: specs.ActErrno,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_FS}),
+			expected: uint32(errnoAction),
+		},
+		{
+			name: "match_args_and",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"getsockopt",
+						},
+						Args: []specs.LinuxSeccompArg{
+							{
+								Index: 1,
+								Value: syscall.SOL_SOCKET,
+								Op:    specs.OpEqualTo,
+							},
+							{
+								Index: 2,
+								Value: syscall.SO_PEERCRED,
+								Op:    specs.OpEqualTo,
+							},
+						},
+						Action: specs.ActErrno,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "getsockopt", &[6]uint64{0, syscall.SOL_SOCKET, syscall.SO_PEERCRED}),
+			expected: uint32(errnoAction),
+		},
+		{
+			name: "no_match_args_and",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"getsockopt",
+						},
+						Args: []specs.LinuxSeccompArg{
+							{
+								Index: 1,
+								Value: syscall.SOL_SOCKET,
+								Op:    specs.OpEqualTo,
+							},
+							{
+								Index: 2,
+								Value: syscall.SO_PEERCRED,
+								Op:    specs.OpEqualTo,
+							},
+						},
+						Action: specs.ActErrno,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "getsockopt", &[6]uint64{0, syscall.SOL_SOCKET}),
+			expected: uint32(allowAction),
+		},
+		{
+			name: "Simple args (no match)",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"clone",
+						},
+						Args: []specs.LinuxSeccompArg{
+							{
+								Index: 0,
+								Value: syscall.CLONE_FS,
+								Op:    specs.OpEqualTo,
+							},
+						},
+						Action: specs.ActErrno,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_VM}),
+			expected: uint32(allowAction),
+		},
+		{
+			name: "OpMaskedEqual (match)",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"clone",
+						},
+						Args: []specs.LinuxSeccompArg{
+							{
+								Index:    0,
+								Value:    syscall.CLONE_FS,
+								ValueTwo: syscall.CLONE_FS,
+								Op:       specs.OpMaskedEqual,
+							},
+						},
+						Action: specs.ActErrno,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_FS | syscall.CLONE_VM}),
+			expected: uint32(errnoAction),
+		},
+		{
+			name: "OpMaskedEqual (no match)",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActAllow,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"clone",
+						},
+						Args: []specs.LinuxSeccompArg{
+							{
+								Index:    0,
+								Value:    syscall.CLONE_FS | syscall.CLONE_VM,
+								ValueTwo: syscall.CLONE_FS | syscall.CLONE_VM,
+								Op:       specs.OpMaskedEqual,
+							},
+						},
+						Action: specs.ActErrno,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_FS}),
+			expected: uint32(allowAction),
+		},
+		{
+			name: "OpMaskedEqual (clone)",
+			config: specs.LinuxSeccomp{
+				DefaultAction: specs.ActErrno,
+				Syscalls: []specs.LinuxSyscall{
+					{
+						Names: []string{
+							"clone",
+						},
+						// This comes from the Docker default seccomp
+						// profile for clone.
+						Args: []specs.LinuxSeccompArg{
+							{
+								Index:    0,
+								Value:    0x7e020000,
+								ValueTwo: 0x0,
+								Op:       specs.OpMaskedEqual,
+							},
+						},
+						Action: specs.ActAllow,
+					},
+				},
+			},
+			input:    testInput(nativeArchAuditNo, "clone", &[6]uint64{0x50f00}),
+			expected: uint32(allowAction),
+		},
+	}
+)
+
+// TestRunscSeccomp generates seccomp programs from OCI config and executes
+// them using runsc's library, comparing against expected results.
+func TestRunscSeccomp(t *testing.T) {
+	for _, tc := range seccompTests {
+		t.Run(tc.name, func(t *testing.T) {
+			runscProgram, err := BuildProgram(&tc.config)
+			if err != nil {
+				t.Fatalf("generating runsc BPF: %v", err)
+			}
+
+			if err := checkProgram(runscProgram, tc.input, tc.expected); err != nil {
+				t.Fatalf("running runsc BPF: %v", err)
+			}
+		})
+	}
+}
+
+// checkProgram runs the given program over the given input and checks the
+// result against the expected output.
+func checkProgram(p bpf.Program, in bpf.Input, expected uint32) error {
+	result, err := bpf.Exec(p, in)
+	if err != nil {
+		return err
+	}
+
+	if result != expected {
+		// Include a decoded version of the program in output for debugging purposes.
+		decoded, _ := bpf.DecodeProgram(p)
+		return fmt.Errorf("Unexpected result: got: %d, expected: %d\nBPF Program\n%s", result, expected, decoded)
+	}
+
+	return nil
+}
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index f1fa573c5..45abc1425 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -29,11 +29,13 @@ import (
 	"time"
 
 	"github.com/cenkalti/backoff"
+	"github.com/mohae/deepcopy"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/bits"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 // ExePath must point to runsc binary, which is normally the same binary. It's
@@ -44,20 +46,31 @@ var ExePath = "/proc/self/exe"
 var Version = specs.Version
 
-// LogSpec logs the spec in a human-friendly way.
+// LogSpec logs a trimmed copy of the spec as indented JSON at debug level.
-func LogSpec(spec *specs.Spec) {
-	log.Debugf("Spec: %+v", spec)
-	log.Debugf("Spec.Hooks: %+v", spec.Hooks)
-	log.Debugf("Spec.Linux: %+v", spec.Linux)
-	if spec.Linux != nil && spec.Linux.Resources != nil {
-		res := spec.Linux.Resources
-		log.Debugf("Spec.Linux.Resources.Memory: %+v", res.Memory)
-		log.Debugf("Spec.Linux.Resources.CPU: %+v", res.CPU)
-		log.Debugf("Spec.Linux.Resources.BlockIO: %+v", res.BlockIO)
-		log.Debugf("Spec.Linux.Resources.Network: %+v", res.Network)
-	}
-	log.Debugf("Spec.Process: %+v", spec.Process)
-	log.Debugf("Spec.Root: %+v", spec.Root)
-	log.Debugf("Spec.Mounts: %+v", spec.Mounts)
+func LogSpec(orig *specs.Spec) {
+	if !log.IsLogging(log.Debug) {
+		return
+	}
+
+	// Strip uninteresting sections from a deep copy so orig is not modified.
+	spec := deepcopy.Copy(orig).(*specs.Spec)
+	if spec.Process != nil {
+		spec.Process.Capabilities = nil
+	}
+	if spec.Linux != nil {
+		spec.Linux.Seccomp = nil
+		spec.Linux.MaskedPaths = nil
+		spec.Linux.ReadonlyPaths = nil
+		if spec.Linux.Resources != nil {
+			spec.Linux.Resources.Devices = nil
+		}
+	}
+
+	out, err := json.MarshalIndent(spec, "", "  ")
+	if err != nil {
+		log.Debugf("Failed to marshal spec: %v", err)
+		return
+	}
+	log.Debugf("Spec:\n%s", out)
 }
 
 // ValidateSpec validates that the spec is compatible with runsc.
@@ -98,11 +111,6 @@ func ValidateSpec(spec *specs.Spec) error {
 		log.Warningf("noNewPrivileges ignored. PR_SET_NO_NEW_PRIVS is assumed to always be set.")
 	}
 
-	// TODO(gvisor.dev/issue/510): Apply seccomp to application inside sandbox.
-	if spec.Linux != nil && spec.Linux.Seccomp != nil {
-		log.Warningf("Seccomp spec is being ignored")
-	}
-
 	if spec.Linux != nil && spec.Linux.RootfsPropagation != "" {
 		if err := validateRootfsPropagation(spec.Linux.RootfsPropagation); err != nil {
 			return err
@@ -149,18 +157,18 @@ func OpenSpec(bundleDir string) (*os.File, error) {
 // ReadSpec reads an OCI runtime spec from the given bundle directory.
-// ReadSpec also normalizes all potential relative paths into absolute
-// path, e.g. spec.Root.Path, mount.Source.
+// It also normalizes potentially relative paths (e.g. spec.Root.Path,
+// mount.Source) into absolute ones, and passes conf to ReadSpecFromFile.
-func ReadSpec(bundleDir string) (*specs.Spec, error) {
+func ReadSpec(bundleDir string, conf *config.Config) (*specs.Spec, error) {
 	specFile, err := OpenSpec(bundleDir)
 	if err != nil {
 		return nil, fmt.Errorf("error opening spec file %q: %v", filepath.Join(bundleDir, "config.json"), err)
 	}
 	defer specFile.Close()
-	return ReadSpecFromFile(bundleDir, specFile)
+	return ReadSpecFromFile(bundleDir, specFile, conf)
 }
 
 // ReadSpecFromFile reads an OCI runtime spec from the given File, and
 // normalizes all relative paths into absolute by prepending the bundle dir.
-func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error) {
+func ReadSpecFromFile(bundleDir string, specFile *os.File, conf *config.Config) (*specs.Spec, error) {
 	if _, err := specFile.Seek(0, os.SEEK_SET); err != nil {
 		return nil, fmt.Errorf("error seeking to beginning of file %q: %v", specFile.Name(), err)
 	}
@@ -183,6 +191,20 @@ func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error)
 			m.Source = absPath(bundleDir, m.Source)
 		}
 	}
+
+	// Override flags using annotation to allow customization per sandbox
+	// instance.
+	for annotation, val := range spec.Annotations {
+		const flagPrefix = "dev.gvisor.flag."
+		if strings.HasPrefix(annotation, flagPrefix) {
+			name := annotation[len(flagPrefix):]
+			log.Infof("Overriding flag: %s=%q", name, val)
+			if err := conf.Override(name, val); err != nil {
+				return nil, err
+			}
+		}
+	}
+
 	return &spec, nil
 }
 
@@ -322,15 +344,9 @@ func IsSupportedDevMount(m specs.Mount) bool {
 	var existingDevices = []string{
 		"/dev/fd", "/dev/stdin", "/dev/stdout", "/dev/stderr",
 		"/dev/null", "/dev/zero", "/dev/full", "/dev/random",
-		"/dev/urandom", "/dev/shm", "/dev/pts", "/dev/ptmx",
+		"/dev/urandom", "/dev/shm", "/dev/ptmx",
 	}
 	dst := filepath.Clean(m.Destination)
-	if dst == "/dev" {
-		// OCI spec uses many different mounts for the things inside of '/dev'. We
-		// have a single mount at '/dev' that is always mounted, regardless of
-		// whether it was asked for, as the spec says we SHOULD.
-		return false
-	}
 	for _, dev := range existingDevices {
 		if dst == dev || strings.HasPrefix(dst, dev+"/") {
 			return false
@@ -403,7 +419,7 @@ func Mount(src, dst, typ string, flags uint32) error {
 		// Special case, as there is no source directory for proc mounts.
 		isDir = true
 	} else if fi, err := os.Stat(src); err != nil {
-		return fmt.Errorf("Stat(%q) failed: %v", src, err)
+		return fmt.Errorf("stat(%q) failed: %v", src, err)
 	} else {
 		isDir = fi.IsDir()
 	}
@@ -411,25 +427,25 @@ func Mount(src, dst, typ string, flags uint32) error {
 	if isDir {
 		// Create the destination directory.
 		if err := os.MkdirAll(dst, 0777); err != nil {
-			return fmt.Errorf("Mkdir(%q) failed: %v", dst, err)
+			return fmt.Errorf("mkdir(%q) failed: %v", dst, err)
 		}
 	} else {
 		// Create the parent destination directory.
 		parent := path.Dir(dst)
 		if err := os.MkdirAll(parent, 0777); err != nil {
-			return fmt.Errorf("Mkdir(%q) failed: %v", parent, err)
+			return fmt.Errorf("mkdir(%q) failed: %v", parent, err)
 		}
 		// Create the destination file if it does not exist.
 		f, err := os.OpenFile(dst, syscall.O_CREAT, 0777)
 		if err != nil {
-			return fmt.Errorf("Open(%q) failed: %v", dst, err)
+			return fmt.Errorf("open(%q) failed: %v", dst, err)
 		}
 		f.Close()
 	}
 
 	// Do the mount.
 	if err := syscall.Mount(src, dst, typ, uintptr(flags), ""); err != nil {
-		return fmt.Errorf("Mount(%q, %q, %d) failed: %v", src, dst, flags, err)
+		return fmt.Errorf("mount(%q, %q, %d) failed: %v", src, dst, flags, err)
 	}
 	return nil
 }