14 files changed, 1040 insertions, 1056 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index aad2a41de..2d9517f4a 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -8,7 +8,6 @@ go_library(
         "compat.go",
         "compat_amd64.go",
         "compat_arm64.go",
-        "config.go",
         "controller.go",
         "debug.go",
         "events.go",
@@ -27,10 +26,13 @@ go_library(
     deps = [
         "//pkg/abi",
         "//pkg/abi/linux",
+        "//pkg/bpf",
+        "//pkg/cleanup",
         "//pkg/context",
         "//pkg/control/server",
         "//pkg/cpuid",
         "//pkg/eventchannel",
+        "//pkg/fd",
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/memutil",
@@ -90,6 +92,7 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/link/fdbased",
         "//pkg/tcpip/link/loopback",
+        "//pkg/tcpip/link/packetsocket",
         "//pkg/tcpip/link/qdisc/fifo",
         "//pkg/tcpip/link/sniffer",
         "//pkg/tcpip/network/arp",
@@ -104,9 +107,11 @@ go_library(
         "//runsc/boot/filter",
         "//runsc/boot/platforms",
         "//runsc/boot/pprof",
+        "//runsc/config",
         "//runsc/specutils",
+        "//runsc/specutils/seccomp",
         "@com_github_golang_protobuf//proto:go_default_library",
-        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
@@ -122,6 +127,7 @@ go_test(
     library = ":boot",
     deps = [
         "//pkg/control/server",
+        "//pkg/fd",
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/p9",
@@ -130,8 +136,9 @@ go_test(
         "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/unet",
+        "//runsc/config",
         "//runsc/fsgofer",
-        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
deleted file mode 100644
index bb01b8fb5..000000000
--- a/runsc/boot/config.go
+++ /dev/null
@@ -1,329 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package boot
-
-import (
-	"fmt"
-	"strconv"
-	"strings"
-
-	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/watchdog"
-)
-
-// FileAccessType tells how the filesystem is accessed.
-type FileAccessType int
-
-const (
-	// FileAccessShared sends IO requests to a Gofer process that validates the
-	// requests and forwards them to the host.
-	FileAccessShared FileAccessType = iota
-
-	// FileAccessExclusive is the same as FileAccessShared, but enables
-	// extra caching for improved performance. It should only be used if
-	// the sandbox has exclusive access to the filesystem.
-	FileAccessExclusive
-)
-
-// MakeFileAccessType converts type from string.
-func MakeFileAccessType(s string) (FileAccessType, error) {
-	switch s {
-	case "shared":
-		return FileAccessShared, nil
-	case "exclusive":
-		return FileAccessExclusive, nil
-	default:
-		return 0, fmt.Errorf("invalid file access type %q", s)
-	}
-}
-
-func (f FileAccessType) String() string {
-	switch f {
-	case FileAccessShared:
-		return "shared"
-	case FileAccessExclusive:
-		return "exclusive"
-	default:
-		return fmt.Sprintf("unknown(%d)", f)
-	}
-}
-
-// NetworkType tells which network stack to use.
-type NetworkType int
-
-const (
-	// NetworkSandbox uses internal network stack, isolated from the host.
-	NetworkSandbox NetworkType = iota
-
-	// NetworkHost redirects network related syscalls to the host network.
-	NetworkHost
-
-	// NetworkNone sets up just loopback using netstack.
-	NetworkNone
-)
-
-// MakeNetworkType converts type from string.
-func MakeNetworkType(s string) (NetworkType, error) {
-	switch s {
-	case "sandbox":
-		return NetworkSandbox, nil
-	case "host":
-		return NetworkHost, nil
-	case "none":
-		return NetworkNone, nil
-	default:
-		return 0, fmt.Errorf("invalid network type %q", s)
-	}
-}
-
-func (n NetworkType) String() string {
-	switch n {
-	case NetworkSandbox:
-		return "sandbox"
-	case NetworkHost:
-		return "host"
-	case NetworkNone:
-		return "none"
-	default:
-		return fmt.Sprintf("unknown(%d)", n)
-	}
-}
-
-// MakeWatchdogAction converts type from string.
-func MakeWatchdogAction(s string) (watchdog.Action, error) {
-	switch strings.ToLower(s) {
-	case "log", "logwarning":
-		return watchdog.LogWarning, nil
-	case "panic":
-		return watchdog.Panic, nil
-	default:
-		return 0, fmt.Errorf("invalid watchdog action %q", s)
-	}
-}
-
-// MakeRefsLeakMode converts type from string.
-func MakeRefsLeakMode(s string) (refs.LeakMode, error) {
-	switch strings.ToLower(s) {
-	case "disabled":
-		return refs.NoLeakChecking, nil
-	case "log-names":
-		return refs.LeaksLogWarning, nil
-	case "log-traces":
-		return refs.LeaksLogTraces, nil
-	default:
-		return 0, fmt.Errorf("invalid refs leakmode %q", s)
-	}
-}
-
-func refsLeakModeToString(mode refs.LeakMode) string {
-	switch mode {
-	// If not set, default it to disabled.
-	case refs.UninitializedLeakChecking, refs.NoLeakChecking:
-		return "disabled"
-	case refs.LeaksLogWarning:
-		return "log-names"
-	case refs.LeaksLogTraces:
-		return "log-traces"
-	default:
-		panic(fmt.Sprintf("Invalid leakmode: %d", mode))
-	}
-}
-
-// Config holds configuration that is not part of the runtime spec.
-type Config struct {
-	// RootDir is the runtime root directory.
-	RootDir string
-
-	// Debug indicates that debug logging should be enabled.
-	Debug bool
-
-	// LogFilename is the filename to log to, if not empty.
-	LogFilename string
-
-	// LogFormat is the log format.
-	LogFormat string
-
-	// DebugLog is the path to log debug information to, if not empty.
-	DebugLog string
-
-	// PanicLog is the path to log GO's runtime messages, if not empty.
-	PanicLog string
-
-	// DebugLogFormat is the log format for debug.
-	DebugLogFormat string
-
-	// FileAccess indicates how the filesystem is accessed.
-	FileAccess FileAccessType
-
-	// Overlay is whether to wrap the root filesystem in an overlay.
-	Overlay bool
-
-	// FSGoferHostUDS enables the gofer to mount a host UDS.
-	FSGoferHostUDS bool
-
-	// Network indicates what type of network to use.
-	Network NetworkType
-
-	// EnableRaw indicates whether raw sockets should be enabled. Raw
-	// sockets are disabled by stripping CAP_NET_RAW from the list of
-	// capabilities.
-	EnableRaw bool
-
-	// HardwareGSO indicates that hardware segmentation offload is enabled.
-	HardwareGSO bool
-
-	// SoftwareGSO indicates that software segmentation offload is enabled.
-	SoftwareGSO bool
-
-	// TXChecksumOffload indicates that TX Checksum Offload is enabled.
-	TXChecksumOffload bool
-
-	// RXChecksumOffload indicates that RX Checksum Offload is enabled.
-	RXChecksumOffload bool
-
-	// QDisc indicates the type of queuening discipline to use by default
-	// for non-loopback interfaces.
-	QDisc QueueingDiscipline
-
-	// LogPackets indicates that all network packets should be logged.
-	LogPackets bool
-
-	// Platform is the platform to run on.
-	Platform string
-
-	// Strace indicates that strace should be enabled.
-	Strace bool
-
-	// StraceSyscalls is the set of syscalls to trace.  If StraceEnable is
-	// true and this list is empty, then all syscalls will be traced.
-	StraceSyscalls []string
-
-	// StraceLogSize is the max size of data blobs to display.
-	StraceLogSize uint
-
-	// DisableSeccomp indicates whether seccomp syscall filters should be
-	// disabled. Pardon the double negation, but default to enabled is important.
-	DisableSeccomp bool
-
-	// WatchdogAction sets what action the watchdog takes when triggered.
-	WatchdogAction watchdog.Action
-
-	// PanicSignal registers signal handling that panics. Usually set to
-	// SIGUSR2(12) to troubleshoot hangs. -1 disables it.
-	PanicSignal int
-
-	// ProfileEnable is set to prepare the sandbox to be profiled.
-	ProfileEnable bool
-
-	// RestoreFile is the path to the saved container image
-	RestoreFile string
-
-	// NumNetworkChannels controls the number of AF_PACKET sockets that map
-	// to the same underlying network device. This allows netstack to better
-	// scale for high throughput use cases.
-	NumNetworkChannels int
-
-	// Rootless allows the sandbox to be started with a user that is not root.
-	// Defense is depth measures are weaker with rootless. Specifically, the
-	// sandbox and Gofer process run as root inside a user namespace with root
-	// mapped to the caller's user.
-	Rootless bool
-
-	// AlsoLogToStderr allows to send log messages to stderr.
-	AlsoLogToStderr bool
-
-	// ReferenceLeakMode sets reference leak check mode
-	ReferenceLeakMode refs.LeakMode
-
-	// OverlayfsStaleRead instructs the sandbox to assume that the root mount
-	// is on a Linux overlayfs mount, which does not necessarily preserve
-	// coherence between read-only and subsequent writable file descriptors
-	// representing the "same" file.
-	OverlayfsStaleRead bool
-
-	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
-	// tests. It allows runsc to start the sandbox process as the current
-	// user, and without chrooting the sandbox process. This can be
-	// necessary in test environments that have limited capabilities.
-	TestOnlyAllowRunAsCurrentUserWithoutChroot bool
-
-	// TestOnlyTestNameEnv should only be used in tests. It looks up for the
-	// test name in the container environment variables and adds it to the debug
-	// log file name. This is done to help identify the log with the test when
-	// multiple tests are run in parallel, since there is no way to pass
-	// parameters to the runtime from docker.
-	TestOnlyTestNameEnv string
-
-	// CPUNumFromQuota sets CPU number count to available CPU quota, using
-	// least integer value greater than or equal to quota.
-	//
-	// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
-	CPUNumFromQuota bool
-
-	// Enables VFS2 (not plumbled through yet).
-	VFS2 bool
-}
-
-// ToFlags returns a slice of flags that correspond to the given Config.
-func (c *Config) ToFlags() []string {
-	f := []string{
-		"--root=" + c.RootDir,
-		"--debug=" + strconv.FormatBool(c.Debug),
-		"--log=" + c.LogFilename,
-		"--log-format=" + c.LogFormat,
-		"--debug-log=" + c.DebugLog,
-		"--panic-log=" + c.PanicLog,
-		"--debug-log-format=" + c.DebugLogFormat,
-		"--file-access=" + c.FileAccess.String(),
-		"--overlay=" + strconv.FormatBool(c.Overlay),
-		"--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS),
-		"--network=" + c.Network.String(),
-		"--log-packets=" + strconv.FormatBool(c.LogPackets),
-		"--platform=" + c.Platform,
-		"--strace=" + strconv.FormatBool(c.Strace),
-		"--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","),
-		"--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)),
-		"--watchdog-action=" + c.WatchdogAction.String(),
-		"--panic-signal=" + strconv.Itoa(c.PanicSignal),
-		"--profile=" + strconv.FormatBool(c.ProfileEnable),
-		"--net-raw=" + strconv.FormatBool(c.EnableRaw),
-		"--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
-		"--rootless=" + strconv.FormatBool(c.Rootless),
-		"--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr),
-		"--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode),
-		"--gso=" + strconv.FormatBool(c.HardwareGSO),
-		"--software-gso=" + strconv.FormatBool(c.SoftwareGSO),
-		"--rx-checksum-offload=" + strconv.FormatBool(c.RXChecksumOffload),
-		"--tx-checksum-offload=" + strconv.FormatBool(c.TXChecksumOffload),
-		"--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
-		"--qdisc=" + c.QDisc.String(),
-	}
-	if c.CPUNumFromQuota {
-		f = append(f, "--cpu-num-from-quota")
-	}
-	// Only include these if set since it is never to be used by users.
-	if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
-		f = append(f, "--TESTONLY-unsafe-nonroot=true")
-	}
-	if len(c.TestOnlyTestNameEnv) != 0 {
-		f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv)
-	}
-
-	if c.VFS2 {
-		f = append(f, "--vfs2=true")
-	}
-
-	return f
-}
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 8125d5061..894651519 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -22,6 +22,7 @@ import (
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/control/server"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -33,6 +34,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/boot/pprof"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -101,14 +103,13 @@ const (
 
 // Profiling related commands (see pprof.go for more details).
 const (
-	StartCPUProfile  = "Profile.StartCPUProfile"
-	StopCPUProfile   = "Profile.StopCPUProfile"
-	HeapProfile      = "Profile.HeapProfile"
-	GoroutineProfile = "Profile.GoroutineProfile"
-	BlockProfile     = "Profile.BlockProfile"
-	MutexProfile     = "Profile.MutexProfile"
-	StartTrace       = "Profile.StartTrace"
-	StopTrace        = "Profile.StopTrace"
+	StartCPUProfile = "Profile.StartCPUProfile"
+	StopCPUProfile  = "Profile.StopCPUProfile"
+	HeapProfile     = "Profile.HeapProfile"
+	BlockProfile    = "Profile.BlockProfile"
+	MutexProfile    = "Profile.MutexProfile"
+	StartTrace      = "Profile.StartTrace"
+	StopTrace       = "Profile.StopTrace"
 )
 
 // Logging related commands (see logging.go for more details).
@@ -129,42 +130,52 @@ type controller struct {
 
 	// manager holds the containerManager methods.
 	manager *containerManager
+
+	// pprof holds the profile instance if enabled. It may be nil.
+	pprof *control.Profile
 }
 
 // newController creates a new controller. The caller must call
 // controller.srv.StartServing() to start the controller.
 func newController(fd int, l *Loader) (*controller, error) {
-	srv, err := server.CreateFromFD(fd)
+	ctrl := &controller{}
+	var err error
+	ctrl.srv, err = server.CreateFromFD(fd)
 	if err != nil {
 		return nil, err
 	}
 
-	manager := &containerManager{
+	ctrl.manager = &containerManager{
 		startChan:       make(chan struct{}),
 		startResultChan: make(chan error),
 		l:               l,
 	}
-	srv.Register(manager)
+	ctrl.srv.Register(ctrl.manager)
 
 	if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok {
 		net := &Network{
 			Stack: eps.Stack,
 		}
-		srv.Register(net)
+		ctrl.srv.Register(net)
 	}
 
-	srv.Register(&debug{})
-	srv.Register(&control.Logging{})
-	if l.conf.ProfileEnable {
-		srv.Register(&control.Profile{
-			Kernel: l.k,
-		})
+	ctrl.srv.Register(&debug{})
+	ctrl.srv.Register(&control.Logging{})
+
+	if l.root.conf.ProfileEnable {
+		ctrl.pprof = &control.Profile{Kernel: l.k}
+		ctrl.srv.Register(ctrl.pprof)
 	}
 
-	return &controller{
-		srv:     srv,
-		manager: manager,
-	}, nil
+	return ctrl, nil
+}
+
+func (c *controller) stop() {
+	if c.pprof != nil {
+		// These are noop if there is nothing being profiled.
+		_ = c.pprof.StopCPUProfile(nil, nil)
+		_ = c.pprof.StopTrace(nil, nil)
+	}
 }
 
 // containerManager manages sandbox containers.
@@ -211,7 +222,7 @@ type StartArgs struct {
 	Spec *specs.Spec
 
 	// Config is the runsc-specific configuration for the sandbox.
-	Conf *Config
+	Conf *config.Config
 
 	// CID is the ID of the container to start.
 	CID string
@@ -247,13 +258,20 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
 	// All validation passed, logs the spec for debugging.
 	specutils.LogSpec(args.Spec)
 
-	err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files)
+	fds, err := fd.NewFromFiles(args.FilePayload.Files)
 	if err != nil {
+		return err
+	}
+	defer func() {
+		for _, fd := range fds {
+			_ = fd.Close()
+		}
+	}()
+	if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, fds); err != nil {
 		log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err)
 		return err
 	}
 	log.Debugf("Container %q started", args.CID)
-
 	return nil
 }
 
@@ -333,7 +351,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	// Pause the kernel while we build a new one.
 	cm.l.k.Pause()
 
-	p, err := createPlatform(cm.l.conf, deviceFile)
+	p, err := createPlatform(cm.l.root.conf, deviceFile)
 	if err != nil {
 		return fmt.Errorf("creating platform: %v", err)
 	}
@@ -349,8 +367,8 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	cm.l.k = k
 
 	// Set up the restore environment.
-	mntr := newContainerMounter(cm.l.spec, cm.l.goferFDs, cm.l.k, cm.l.mountHints)
-	renv, err := mntr.createRestoreEnvironment(cm.l.conf)
+	mntr := newContainerMounter(cm.l.root.spec, cm.l.root.goferFDs, cm.l.k, cm.l.mountHints)
+	renv, err := mntr.createRestoreEnvironment(cm.l.root.conf)
 	if err != nil {
 		return fmt.Errorf("creating RestoreEnvironment: %v", err)
 	}
@@ -368,7 +386,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 		return fmt.Errorf("file cannot be empty")
 	}
 
-	if cm.l.conf.ProfileEnable {
+	if cm.l.root.conf.ProfileEnable {
 		// pprof.Initialize opens /proc/self/maps, so has to be called before
 		// installing seccomp filters.
 		pprof.Initialize()
@@ -387,13 +405,13 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 
 	// Since we have a new kernel we also must make a new watchdog.
 	dogOpts := watchdog.DefaultOpts
-	dogOpts.TaskTimeoutAction = cm.l.conf.WatchdogAction
+	dogOpts.TaskTimeoutAction = cm.l.root.conf.WatchdogAction
 	dog := watchdog.New(k, dogOpts)
 
 	// Change the loader fields to reflect the changes made when restoring.
 	cm.l.k = k
 	cm.l.watchdog = dog
-	cm.l.rootProcArgs = kernel.CreateProcessArgs{}
+	cm.l.root.procArgs = kernel.CreateProcessArgs{}
 	cm.l.restore = true
 
 	// Reinitialize the sandbox ID and processes map. Note that it doesn't
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 60e33425f..6ac19668f 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -27,41 +27,30 @@ import (
 // allowedSyscalls is the set of syscalls executed by the Sentry to the host OS.
 var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_CLOCK_GETTIME: {},
-	syscall.SYS_CLONE: []seccomp.Rule{
-		{
-			seccomp.AllowValue(
-				syscall.CLONE_VM |
-					syscall.CLONE_FS |
-					syscall.CLONE_FILES |
-					syscall.CLONE_SIGHAND |
-					syscall.CLONE_SYSVSEM |
-					syscall.CLONE_THREAD),
-		},
-	},
-	syscall.SYS_CLOSE: {},
-	syscall.SYS_DUP:   {},
+	syscall.SYS_CLOSE:         {},
+	syscall.SYS_DUP:           {},
 	syscall.SYS_DUP3: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.O_CLOEXEC),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.O_CLOEXEC),
 		},
 	},
 	syscall.SYS_EPOLL_CREATE1: {},
 	syscall.SYS_EPOLL_CTL:     {},
 	syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_EVENTFD2: []seccomp.Rule{
 		{
-			seccomp.AllowValue(0),
-			seccomp.AllowValue(0),
+			seccomp.EqualTo(0),
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_EXIT:       {},
@@ -70,16 +59,16 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_FCHMOD:     {},
 	syscall.SYS_FCNTL: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.F_GETFL),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.F_GETFL),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.F_SETFL),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.F_SETFL),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.F_GETFD),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.F_GETFD),
 		},
 	},
 	syscall.SYS_FSTAT:     {},
@@ -87,52 +76,52 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_FTRUNCATE: {},
 	syscall.SYS_FUTEX: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
+			seccomp.MatchAny{},
 		},
 		// Non-private variants are included for flipcall support. They are otherwise
 		// unncessary, as the sentry will use only private futexes internally.
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAIT),
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAIT),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAKE),
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAKE),
+			seccomp.MatchAny{},
 		},
 	},
 	syscall.SYS_GETPID: {},
 	unix.SYS_GETRANDOM: {},
 	syscall.SYS_GETSOCKOPT: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.SOL_SOCKET),
-			seccomp.AllowValue(syscall.SO_DOMAIN),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.SOL_SOCKET),
+			seccomp.EqualTo(syscall.SO_DOMAIN),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.SOL_SOCKET),
-			seccomp.AllowValue(syscall.SO_TYPE),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.SOL_SOCKET),
+			seccomp.EqualTo(syscall.SO_TYPE),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.SOL_SOCKET),
-			seccomp.AllowValue(syscall.SO_ERROR),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.SOL_SOCKET),
+			seccomp.EqualTo(syscall.SO_ERROR),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.SOL_SOCKET),
-			seccomp.AllowValue(syscall.SO_SNDBUF),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.SOL_SOCKET),
+			seccomp.EqualTo(syscall.SO_SNDBUF),
 		},
 	},
 	syscall.SYS_GETTID:       {},
@@ -141,34 +130,34 @@ var allowedSyscalls = seccomp.SyscallRules{
 	// setting/getting termios and winsize.
 	syscall.SYS_IOCTL: []seccomp.Rule{
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TCGETS),
-			seccomp.AllowAny{}, /* termios struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TCGETS),
+			seccomp.MatchAny{}, /* termios struct */
 		},
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TCSETS),
-			seccomp.AllowAny{}, /* termios struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TCSETS),
+			seccomp.MatchAny{}, /* termios struct */
 		},
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TCSETSF),
-			seccomp.AllowAny{}, /* termios struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TCSETSF),
+			seccomp.MatchAny{}, /* termios struct */
 		},
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TCSETSW),
-			seccomp.AllowAny{}, /* termios struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TCSETSW),
+			seccomp.MatchAny{}, /* termios struct */
 		},
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TIOCSWINSZ),
-			seccomp.AllowAny{}, /* winsize struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TIOCSWINSZ),
+			seccomp.MatchAny{}, /* winsize struct */
 		},
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TIOCGWINSZ),
-			seccomp.AllowAny{}, /* winsize struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TIOCGWINSZ),
+			seccomp.MatchAny{}, /* winsize struct */
 		},
 	},
 	syscall.SYS_LSEEK:   {},
@@ -182,46 +171,46 @@ var allowedSyscalls = seccomp.SyscallRules{
 	// TODO(b/148688965): Remove once this is gone from Go.
 	syscall.SYS_MLOCK: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(4096),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(4096),
 		},
 	},
 	syscall.SYS_MMAP: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_SHARED),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_SHARED),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_PRIVATE),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_PRIVATE),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.PROT_WRITE | syscall.PROT_READ),
-			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.PROT_WRITE | syscall.PROT_READ),
+			seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
 		},
 	},
 	syscall.SYS_MPROTECT:  {},
@@ -237,32 +226,32 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_READ:      {},
 	syscall.SYS_RECVMSG: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
 		},
 	},
 	syscall.SYS_RECVMMSG: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(fdbased.MaxMsgsPerRecv),
-			seccomp.AllowValue(syscall.MSG_DONTWAIT),
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(fdbased.MaxMsgsPerRecv),
+			seccomp.EqualTo(syscall.MSG_DONTWAIT),
+			seccomp.EqualTo(0),
 		},
 	},
 	unix.SYS_SENDMMSG: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT),
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT),
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_RESTART_SYSCALL: {},
@@ -272,57 +261,50 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_SCHED_YIELD:     {},
 	syscall.SYS_SENDMSG: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
 		},
 	},
 	syscall.SYS_SETITIMER: {},
 	syscall.SYS_SHUTDOWN: []seccomp.Rule{
 		// Used by fs/host to shutdown host sockets.
-		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)},
-		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)},
+		{seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RD)},
+		{seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_WR)},
 		// Used by unet to shutdown connections.
-		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
+		{seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RDWR)},
 	},
 	syscall.SYS_SIGALTSTACK:     {},
 	unix.SYS_STATX:              {},
 	syscall.SYS_SYNC_FILE_RANGE: {},
 	syscall.SYS_TEE: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(1),                      /* len */
-			seccomp.AllowValue(unix.SPLICE_F_NONBLOCK), /* flags */
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(1),                      /* len */
+			seccomp.EqualTo(unix.SPLICE_F_NONBLOCK), /* flags */
 		},
 	},
 	syscall.SYS_TGKILL: []seccomp.Rule{
 		{
-			seccomp.AllowValue(uint64(os.Getpid())),
+			seccomp.EqualTo(uint64(os.Getpid())),
 		},
 	},
 	syscall.SYS_UTIMENSAT: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0), /* null pathname */
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0), /* flags */
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0), /* null pathname */
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0), /* flags */
 		},
 	},
 	syscall.SYS_WRITE: {},
-	// The only user in rawfile.NonBlockingWrite3 always passes iovcnt with
-	// values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR
-	// option is enabled for a packet socket.
+	// For rawfile.NonBlockingWriteIovec.
 	syscall.SYS_WRITEV: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(2),
-		},
-		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(3),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.GreaterThan(0),
 		},
 	},
 }
@@ -332,10 +314,10 @@ func hostInetFilters() seccomp.SyscallRules {
 	return seccomp.SyscallRules{
 		syscall.SYS_ACCEPT4: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowAny{},
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.MatchAny{},
+				seccomp.MatchAny{},
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
 			},
 		},
 		syscall.SYS_BIND:        {},
@@ -344,84 +326,84 @@ func hostInetFilters() seccomp.SyscallRules {
 		syscall.SYS_GETSOCKNAME: {},
 		syscall.SYS_GETSOCKOPT: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IP),
-				seccomp.AllowValue(syscall.IP_TOS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IP),
+				seccomp.EqualTo(syscall.IP_TOS),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IP),
-				seccomp.AllowValue(syscall.IP_RECVTOS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IP),
+				seccomp.EqualTo(syscall.IP_RECVTOS),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_TCLASS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_TCLASS),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_RECVTCLASS),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_V6ONLY),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_V6ONLY),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_ERROR),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_ERROR),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_KEEPALIVE),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_KEEPALIVE),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_SNDBUF),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_SNDBUF),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_RCVBUF),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_RCVBUF),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_REUSEADDR),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_REUSEADDR),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_TYPE),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_TYPE),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_LINGER),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_LINGER),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_TCP),
-				seccomp.AllowValue(syscall.TCP_NODELAY),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_TCP),
+				seccomp.EqualTo(syscall.TCP_NODELAY),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_TCP),
-				seccomp.AllowValue(syscall.TCP_INFO),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_TCP),
+				seccomp.EqualTo(syscall.TCP_INFO),
 			},
 		},
 		syscall.SYS_IOCTL: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.TIOCOUTQ),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.TIOCOUTQ),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.TIOCINQ),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.TIOCINQ),
 			},
 		},
 		syscall.SYS_LISTEN:   {},
@@ -432,103 +414,103 @@ func hostInetFilters() seccomp.SyscallRules {
 		syscall.SYS_SENDTO:   {},
 		syscall.SYS_SETSOCKOPT: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_V6ONLY),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_V6ONLY),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_SNDBUF),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_SNDBUF),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_RCVBUF),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_RCVBUF),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_REUSEADDR),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_REUSEADDR),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_TCP),
-				seccomp.AllowValue(syscall.TCP_NODELAY),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_TCP),
+				seccomp.EqualTo(syscall.TCP_NODELAY),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IP),
-				seccomp.AllowValue(syscall.IP_TOS),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IP),
+				seccomp.EqualTo(syscall.IP_TOS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IP),
-				seccomp.AllowValue(syscall.IP_RECVTOS),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IP),
+				seccomp.EqualTo(syscall.IP_RECVTOS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_TCLASS),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_TCLASS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_RECVTCLASS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 		},
 		syscall.SYS_SHUTDOWN: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SHUT_RD),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SHUT_RD),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SHUT_WR),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SHUT_WR),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SHUT_RDWR),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SHUT_RDWR),
 			},
 		},
 		syscall.SYS_SOCKET: []seccomp.Rule{
 			{
-				seccomp.AllowValue(syscall.AF_INET),
-				seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
-				seccomp.AllowValue(0),
+				seccomp.EqualTo(syscall.AF_INET),
+				seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.EqualTo(0),
 			},
 			{
-				seccomp.AllowValue(syscall.AF_INET),
-				seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
-				seccomp.AllowValue(0),
+				seccomp.EqualTo(syscall.AF_INET),
+				seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.EqualTo(0),
 			},
 			{
-				seccomp.AllowValue(syscall.AF_INET6),
-				seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
-				seccomp.AllowValue(0),
+				seccomp.EqualTo(syscall.AF_INET6),
+				seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.EqualTo(0),
 			},
 			{
-				seccomp.AllowValue(syscall.AF_INET6),
-				seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
-				seccomp.AllowValue(0),
+				seccomp.EqualTo(syscall.AF_INET6),
+				seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.EqualTo(0),
 			},
 		},
 		syscall.SYS_WRITEV: {},
@@ -539,20 +521,20 @@ func controlServerFilters(fd int) seccomp.SyscallRules {
 	return seccomp.SyscallRules{
 		syscall.SYS_ACCEPT: []seccomp.Rule{
 			{
-				seccomp.AllowValue(fd),
+				seccomp.EqualTo(fd),
 			},
 		},
 		syscall.SYS_LISTEN: []seccomp.Rule{
 			{
-				seccomp.AllowValue(fd),
-				seccomp.AllowValue(16 /* unet.backlog */),
+				seccomp.EqualTo(fd),
+				seccomp.EqualTo(16 /* unet.backlog */),
 			},
 		},
 		syscall.SYS_GETSOCKOPT: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_PEERCRED),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_PEERCRED),
 			},
 		},
 	}
diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go
index 5335ff82c..cea5613b8 100644
--- a/runsc/boot/filter/config_amd64.go
+++ b/runsc/boot/filter/config_amd64.go
@@ -24,8 +24,41 @@ import (
 )
 
 func init() {
-	allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL],
-		seccomp.Rule{seccomp.AllowValue(linux.ARCH_GET_FS)},
-		seccomp.Rule{seccomp.AllowValue(linux.ARCH_SET_FS)},
-	)
+	allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{
+		// TODO(b/168828518): No longer used in Go 1.16+.
+		{seccomp.EqualTo(linux.ARCH_SET_FS)},
+	}
+
+	allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{
+		// parent_tidptr and child_tidptr are always 0 because neither
+		// CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID is used.
+		{
+			seccomp.EqualTo(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SETTLS |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+			seccomp.MatchAny{}, // newsp
+			seccomp.EqualTo(0), // parent_tidptr
+			seccomp.EqualTo(0), // child_tidptr
+			seccomp.MatchAny{}, // tls
+		},
+		{
+			// TODO(b/168828518): No longer used in Go 1.16+ (on amd64).
+			seccomp.EqualTo(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+			seccomp.MatchAny{}, // newsp
+			seccomp.EqualTo(0), // parent_tidptr
+			seccomp.EqualTo(0), // child_tidptr
+			seccomp.MatchAny{}, // tls
+		},
+	}
 }
diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go
index 7fa9bbda3..37313f97f 100644
--- a/runsc/boot/filter/config_arm64.go
+++ b/runsc/boot/filter/config_arm64.go
@@ -16,6 +16,29 @@
 
 package filter
 
-// Reserve for future customization.
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
 func init() {
+	allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{
+		{
+			seccomp.EqualTo(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+			seccomp.MatchAny{}, // newsp
+			// These arguments are left uninitialized by the Go
+			// runtime, so they may be anything (and are unused by
+			// the host).
+			seccomp.MatchAny{}, // parent_tidptr
+			seccomp.MatchAny{}, // tls
+			seccomp.MatchAny{}, // child_tidptr
+		},
+	}
 }
diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go
index 194952a7b..7b8669595 100644
--- a/runsc/boot/filter/config_profile.go
+++ b/runsc/boot/filter/config_profile.go
@@ -25,9 +25,9 @@ func profileFilters() seccomp.SyscallRules {
 	return seccomp.SyscallRules{
 		syscall.SYS_OPENAT: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
+				seccomp.MatchAny{},
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
 			},
 		},
 	}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index e83584b82..ddf288456 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -29,10 +29,12 @@ import (
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
@@ -47,6 +49,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -65,7 +68,7 @@ const (
 // tmpfs has some extra supported options that we must pass through.
 var tmpfsAllowedData = []string{"mode", "uid", "gid"}
 
-func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
+func addOverlay(ctx context.Context, conf *config.Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
 	// Upper layer uses the same flags as lower, but it must be read-write.
 	upperFlags := lowerFlags
 	upperFlags.ReadOnly = false
@@ -155,7 +158,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
 }
 
 // p9MountData creates a slice of p9 mount data.
-func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
+func p9MountData(fd int, fa config.FileAccessType, vfs2 bool) []string {
 	opts := []string{
 		"trans=fd",
 		"rfdno=" + strconv.Itoa(fd),
@@ -166,7 +169,7 @@ func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
 		// enablement.
 		opts = append(opts, "privateunixsocket=true")
 	}
-	if fa == FileAccessShared {
+	if fa == config.FileAccessShared {
 		opts = append(opts, "cache=remote_revalidating")
 	}
 	return opts
@@ -251,7 +254,7 @@ func mustFindFilesystem(name string) fs.Filesystem {
 
 // addSubmountOverlay overlays the inode over a ramfs tree containing the given
 // paths.
-func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
+func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string, mf fs.MountSourceFlags) (*fs.Inode, error) {
 	// Construct a ramfs tree of mount points. The contents never
 	// change, so this can be fully caching. There's no real
 	// filesystem backing this tree, so we set the filesystem to
@@ -261,7 +264,7 @@ func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string
 	if err != nil {
 		return nil, fmt.Errorf("creating mount tree: %v", err)
 	}
-	overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{})
+	overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, mf)
 	if err != nil {
 		return nil, fmt.Errorf("adding mount overlay: %v", err)
 	}
@@ -280,7 +283,7 @@ func subtargets(root string, mnts []specs.Mount) []string {
 	return targets
 }
 
-func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+func setupContainerFS(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
 	if conf.VFS2 {
 		return setupContainerVFS2(ctx, conf, mntr, procArgs)
 	}
@@ -318,14 +321,14 @@ func adjustDirentCache(k *kernel.Kernel) error {
 }
 
 type fdDispenser struct {
-	fds []int
+	fds []*fd.FD
 }
 
 func (f *fdDispenser) remove() int {
 	if f.empty() {
 		panic("fdDispenser out of fds")
 	}
-	rv := f.fds[0]
+	rv := f.fds[0].Release()
 	f.fds = f.fds[1:]
 	return rv
 }
@@ -390,6 +393,10 @@ type mountHint struct {
 	// root is the inode where the volume is mounted. For mounts with 'pod' share
 	// the volume is mounted once and then bind mounted inside the containers.
 	root *fs.Inode
+
+	// vfsMount is the master mount for the volume. For mounts with 'pod' share
+	// the master volume is bind mounted inside the containers.
+	vfsMount *vfs.Mount
 }
 
 func (m *mountHint) setField(key, val string) error {
@@ -447,27 +454,27 @@ func (m *mountHint) isSupported() bool {
 func (m *mountHint) checkCompatible(mount specs.Mount) error {
 	// Remove options that don't affect to mount's behavior.
 	masterOpts := filterUnsupportedOptions(m.mount)
-	slaveOpts := filterUnsupportedOptions(mount)
+	replicaOpts := filterUnsupportedOptions(mount)
 
-	if len(masterOpts) != len(slaveOpts) {
-		return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+	if len(masterOpts) != len(replicaOpts) {
+		return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts)
 	}
 
 	sort.Strings(masterOpts)
-	sort.Strings(slaveOpts)
+	sort.Strings(replicaOpts)
 	for i, opt := range masterOpts {
-		if opt != slaveOpts[i] {
-			return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+		if opt != replicaOpts[i] {
+			return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts)
 		}
 	}
 	return nil
 }
 
-func (m *mountHint) fileAccessType() FileAccessType {
+func (m *mountHint) fileAccessType() config.FileAccessType {
 	if m.share == container {
-		return FileAccessExclusive
+		return config.FileAccessExclusive
 	}
-	return FileAccessShared
+	return config.FileAccessShared
 }
 
 func filterUnsupportedOptions(mount specs.Mount) []string {
@@ -558,7 +565,7 @@ type containerMounter struct {
 	hints *podMountHints
 }
 
-func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
+func newContainerMounter(spec *specs.Spec, goferFDs []*fd.FD, k *kernel.Kernel, hints *podMountHints) *containerMounter {
 	return &containerMounter{
 		root:   spec.Root,
 		mounts: compileMounts(spec),
@@ -571,9 +578,9 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
 // processHints processes annotations that container hints about how volumes
 // should be mounted (e.g. a volume shared between containers). It must be
 // called for the root container only.
-func (c *containerMounter) processHints(conf *Config) error {
+func (c *containerMounter) processHints(conf *config.Config, creds *auth.Credentials) error {
 	if conf.VFS2 {
-		return nil
+		return c.processHintsVFS2(conf, creds)
 	}
 	ctx := c.k.SupervisorContext()
 	for _, hint := range c.hints.mounts {
@@ -595,7 +602,7 @@ func (c *containerMounter) processHints(conf *Config) error {
 // setupFS is used to set up the file system for all containers. This is the
 // main entry point method, with most of the other being internal only. It
 // returns the mount namespace that is created for the container.
-func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
+func (c *containerMounter) setupFS(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
 	log.Infof("Configuring container's file system")
 
 	// Create context with root credentials to mount the filesystem (the current
@@ -621,7 +628,7 @@ func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessA
 	return mns, nil
 }
 
-func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Config) (*fs.MountNamespace, error) {
+func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config) (*fs.MountNamespace, error) {
 	rootInode, err := c.createRootMount(ctx, conf)
 	if err != nil {
 		return nil, fmt.Errorf("creating filesystem for container: %v", err)
@@ -633,9 +640,9 @@ func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Confi
 	return mns, nil
 }
 
-func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error {
+func (c *containerMounter) mountSubmounts(ctx context.Context, conf *config.Config, mns *fs.MountNamespace) error {
 	root := mns.Root()
-	defer root.DecRef()
+	defer root.DecRef(ctx)
 
 	for _, m := range c.mounts {
 		log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options)
@@ -669,7 +676,7 @@ func (c *containerMounter) checkDispenser() error {
 
 // mountSharedMaster mounts the master of a volume that is shared among
 // containers in a pod. It returns the root mount's inode.
-func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) {
+func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *config.Config, hint *mountHint) (*fs.Inode, error) {
 	// Map mount type to filesystem name, and parse out the options that we are
 	// capable of dealing with.
 	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount)
@@ -709,7 +716,7 @@ func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config,
 }
 
 // createRootMount creates the root filesystem.
-func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) {
+func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Config) (*fs.Inode, error) {
 	// First construct the filesystem from the spec.Root.
 	mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay}
 
@@ -734,7 +741,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
 	// for submount paths.  "/dev" "/sys" "/proc" and "/tmp" are always
 	// mounted even if they are not in the spec.
 	submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp")
-	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
+	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts, mf)
 	if err != nil {
 		return nil, fmt.Errorf("adding submount overlay: %v", err)
 	}
@@ -754,7 +761,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
 
 // getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
 // used for mounts.
-func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) {
+func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m specs.Mount) (string, []string, bool, error) {
 	var (
 		fsName     string
 		opts       []string
@@ -788,19 +795,19 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 	return fsName, opts, useOverlay, nil
 }
 
-func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType {
+func (c *containerMounter) getMountAccessType(mount specs.Mount) config.FileAccessType {
 	if hint := c.hints.findMount(mount); hint != nil {
 		return hint.fileAccessType()
 	}
 	// Non-root bind mounts are always shared if no hints were provided.
-	return FileAccessShared
+	return config.FileAccessShared
 }
 
 // mountSubmount mounts volumes inside the container's root. Because mounts may
 // be readonly, a lower ramfs overlay is added to create the mount point dir.
 // Another overlay is added with tmpfs on top if Config.Overlay is true.
 // 'm.Destination' must be an absolute path with '..' and symlinks resolved.
-func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
+func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
 	// Map mount type to filesystem name, and parse out the options that we are
 	// capable of dealing with.
 	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
@@ -844,7 +851,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
 	submounts := subtargets(m.Destination, c.mounts)
 	if len(submounts) > 0 {
 		log.Infof("Adding submount overlay over %q", m.Destination)
-		inode, err = addSubmountOverlay(ctx, inode, submounts)
+		inode, err = addSubmountOverlay(ctx, inode, submounts, mf)
 		if err != nil {
 			return fmt.Errorf("adding submount overlay: %v", err)
 		}
@@ -863,7 +870,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
 	if err != nil {
 		return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
 	}
-	defer dirent.DecRef()
+	defer dirent.DecRef(ctx)
 	if err := mns.Mount(ctx, dirent, inode); err != nil {
 		return fmt.Errorf("mount %q error: %v", m.Destination, err)
 	}
@@ -884,12 +891,12 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun
 	if err != nil {
 		return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err)
 	}
-	defer target.DecRef()
+	defer target.DecRef(ctx)
 
 	// Take a ref on the inode that is about to be (re)-mounted.
 	source.root.IncRef()
 	if err := mns.Mount(ctx, target, source.root); err != nil {
-		source.root.DecRef()
+		source.root.DecRef(ctx)
 		return fmt.Errorf("bind mount %q error: %v", mount.Destination, err)
 	}
 
@@ -899,7 +906,7 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun
 
 // addRestoreMount adds a mount to the MountSources map used for restoring a
 // checkpointed container.
-func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
+func (c *containerMounter) addRestoreMount(conf *config.Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
 	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
 	if err != nil {
 		return err
@@ -924,7 +931,7 @@ func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnviron
 
 // createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding
 // the mounts to the environment.
-func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) {
+func (c *containerMounter) createRestoreEnvironment(conf *config.Config) (*fs.RestoreEnvironment, error) {
 	renv := &fs.RestoreEnvironment{
 		MountSources: make(map[string][]fs.MountArgs),
 	}
@@ -979,7 +986,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn
 //
 // Note that when there are submounts inside of '/tmp', directories for the
 // mount points must be present, making '/tmp' not empty anymore.
-func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
+func (c *containerMounter) mountTmp(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent) error {
 	for _, m := range c.mounts {
 		if filepath.Clean(m.Destination) == "/tmp" {
 			log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m)
@@ -992,12 +999,12 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.M
 	switch err {
 	case nil:
 		// Found '/tmp' in filesystem, check if it's empty.
-		defer tmp.DecRef()
+		defer tmp.DecRef(ctx)
 		f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true})
 		if err != nil {
 			return err
 		}
-		defer f.DecRef()
+		defer f.DecRef(ctx)
 		serializer := &fs.CollectEntriesSerializer{}
 		if err := f.Readdir(ctx, serializer); err != nil {
 			return err
diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go
index 912037075..e986231e5 100644
--- a/runsc/boot/fs_test.go
+++ b/runsc/boot/fs_test.go
@@ -20,6 +20,7 @@ import (
 	"testing"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 func TestPodMountHintsHappy(t *testing.T) {
@@ -196,7 +197,7 @@ func TestGetMountAccessType(t *testing.T) {
 	for _, tst := range []struct {
 		name        string
 		annotations map[string]string
-		want        FileAccessType
+		want        config.FileAccessType
 	}{
 		{
 			name: "container=exclusive",
@@ -205,7 +206,7 @@ func TestGetMountAccessType(t *testing.T) {
 				MountPrefix + "mount1.type":   "bind",
 				MountPrefix + "mount1.share":  "container",
 			},
-			want: FileAccessExclusive,
+			want: config.FileAccessExclusive,
 		},
 		{
 			name: "pod=shared",
@@ -214,7 +215,7 @@ func TestGetMountAccessType(t *testing.T) {
 				MountPrefix + "mount1.type":   "bind",
 				MountPrefix + "mount1.share":  "pod",
 			},
-			want: FileAccessShared,
+			want: config.FileAccessShared,
 		},
 		{
 			name: "shared=shared",
@@ -223,7 +224,7 @@ func TestGetMountAccessType(t *testing.T) {
 				MountPrefix + "mount1.type":   "bind",
 				MountPrefix + "mount1.share":  "shared",
 			},
-			want: FileAccessShared,
+			want: config.FileAccessShared,
 		},
 		{
 			name: "default=shared",
@@ -232,7 +233,7 @@ func TestGetMountAccessType(t *testing.T) {
 				MountPrefix + "mount1.type":   "bind",
 				MountPrefix + "mount1.share":  "container",
 			},
-			want: FileAccessShared,
+			want: config.FileAccessShared,
 		},
 	} {
 		t.Run(tst.name, func(t *testing.T) {
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index b5df1deb9..dee2c4fbb 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -16,22 +16,25 @@
 package boot
 
 import (
+	"errors"
 	"fmt"
 	mrand "math/rand"
 	"os"
 	"runtime"
 	"sync/atomic"
-	"syscall"
 	gtime "time"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/bpf"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/memutil"
 	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/fdimport"
@@ -66,7 +69,9 @@ import (
 	"gvisor.dev/gvisor/runsc/boot/filter"
 	_ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms.
 	"gvisor.dev/gvisor/runsc/boot/pprof"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
+	"gvisor.dev/gvisor/runsc/specutils/seccomp"
 
 	// Include supported socket providers.
 	"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
@@ -77,6 +82,22 @@ import (
 	_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
 )
 
+type containerInfo struct {
+	conf *config.Config
+
+	// spec is the base configuration for the root container.
+	spec *specs.Spec
+
+	// procArgs refers to the container's init task.
+	procArgs kernel.CreateProcessArgs
+
+	// stdioFDs contains stdin, stdout, and stderr.
+	stdioFDs []*fd.FD
+
+	// goferFDs are the FDs that attach the sandbox to the gofers.
+	goferFDs []*fd.FD
+}
+
 // Loader keeps state needed to start the kernel and run the container..
 type Loader struct {
 	// k is the kernel.
@@ -85,22 +106,11 @@ type Loader struct {
 	// ctrl is the control server.
 	ctrl *controller
 
-	conf *Config
-
-	// console is set to true if terminal is enabled.
-	console bool
+	// root contains information about the root container in the sandbox.
+	root containerInfo
 
 	watchdog *watchdog.Watchdog
 
-	// stdioFDs contains stdin, stdout, and stderr.
-	stdioFDs []int
-
-	// goferFDs are the FDs that attach the sandbox to the gofers.
-	goferFDs []int
-
-	// spec is the base configuration for the root container.
-	spec *specs.Spec
-
 	// stopSignalForwarding disables forwarding of signals to the sandboxed
 	// container. It should be called when a sandbox is destroyed.
 	stopSignalForwarding func()
@@ -108,9 +118,6 @@ type Loader struct {
 	// restore is set to true if we are restoring a container.
 	restore bool
 
-	// rootProcArgs refers to the root sandbox init task.
-	rootProcArgs kernel.CreateProcessArgs
-
 	// sandboxID is the ID for the whole sandbox.
 	sandboxID string
 
@@ -162,7 +169,7 @@ type Args struct {
 	// Spec is the sandbox specification.
 	Spec *specs.Spec
 	// Conf is the system configuration.
-	Conf *Config
+	Conf *config.Config
 	// ControllerFD is the FD to the URPC controller. The Loader takes ownership
 	// of this FD and may close it at any time.
 	ControllerFD int
@@ -175,8 +182,6 @@ type Args struct {
 	// StdioFDs is the stdio for the application. The Loader takes ownership of
 	// these FDs and may close them at any time.
 	StdioFDs []int
-	// Console is set to true if using TTY.
-	Console bool
 	// NumCPU is the number of CPUs to create inside the sandbox.
 	NumCPU int
 	// TotalMem is the initial amount of total memory to report back to the
@@ -187,7 +192,7 @@ type Args struct {
 }
 
 // make sure stdioFDs are always the same on initial start and on restore
-const startingStdioFD = 64
+const startingStdioFD = 256
 
 // New initializes a new kernel loader configured by spec.
 // New also handles setting up a kernel for restoring a container.
@@ -205,6 +210,10 @@ func New(args Args) (*Loader, error) {
 	// Is this a VFSv2 kernel?
 	if args.Conf.VFS2 {
 		kernel.VFS2Enabled = true
+		if args.Conf.FUSE {
+			kernel.FUSEEnabled = true
+		}
+
 		vfs2.Override()
 	}
 
@@ -227,9 +236,7 @@ func New(args Args) (*Loader, error) {
 	// Create VDSO.
 	//
 	// Pass k as the platform since it is savable, unlike the actual platform.
-	//
-	// FIXME(b/109889800): Use non-nil context.
-	vdso, err := loader.PrepareVDSO(nil, k)
+	vdso, err := loader.PrepareVDSO(k)
 	if err != nil {
 		return nil, fmt.Errorf("creating vdso: %v", err)
 	}
@@ -275,6 +282,7 @@ func New(args Args) (*Loader, error) {
 		args.NumCPU = runtime.NumCPU()
 	}
 	log.Infof("CPUs: %d", args.NumCPU)
+	runtime.GOMAXPROCS(args.NumCPU)
 
 	if args.TotalMem > 0 {
 		// Adjust the total memory returned by the Sentry so that applications that
@@ -300,6 +308,12 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("initializing kernel: %v", err)
 	}
 
+	if kernel.VFS2Enabled {
+		if err := registerFilesystems(k); err != nil {
+			return nil, fmt.Errorf("registering filesystems: %w", err)
+		}
+	}
+
 	if err := adjustDirentCache(k); err != nil {
 		return nil, err
 	}
@@ -318,7 +332,7 @@ func New(args Args) (*Loader, error) {
 	dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
 	dog := watchdog.New(k, dogOpts)
 
-	procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
+	procArgs, err := createProcessArgs(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
 	if err != nil {
 		return nil, fmt.Errorf("creating init process for root container: %v", err)
 	}
@@ -338,7 +352,7 @@ func New(args Args) (*Loader, error) {
 		if err != nil {
 			return nil, fmt.Errorf("failed to create hostfs filesystem: %v", err)
 		}
-		defer hostFilesystem.DecRef()
+		defer hostFilesystem.DecRef(k.SupervisorContext())
 		hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{})
 		if err != nil {
 			return nil, fmt.Errorf("failed to create hostfs mount: %v", err)
@@ -346,37 +360,45 @@ func New(args Args) (*Loader, error) {
 		k.SetHostMount(hostMount)
 	}
 
+	info := containerInfo{
+		conf:     args.Conf,
+		spec:     args.Spec,
+		procArgs: procArgs,
+	}
+
 	// Make host FDs stable between invocations. Host FDs must map to the exact
 	// same number when the sandbox is restored. Otherwise the wrong FD will be
 	// used.
-	var stdioFDs []int
 	newfd := startingStdioFD
-	for _, fd := range args.StdioFDs {
-		err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC)
-		if err != nil {
-			return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err)
+	for _, stdioFD := range args.StdioFDs {
+		// Check that newfd is unused to avoid clobbering over it.
+		if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
+			if err != nil {
+				return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
+			}
+			return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
 		}
-		stdioFDs = append(stdioFDs, newfd)
-		err = syscall.Close(fd)
+
+		err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
 		if err != nil {
-			return nil, fmt.Errorf("close original stdioFDs failed: %v", err)
+			return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
 		}
+		info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
+		_ = unix.Close(stdioFD)
 		newfd++
 	}
+	for _, goferFD := range args.GoferFDs {
+		info.goferFDs = append(info.goferFDs, fd.New(goferFD))
+	}
 
 	eid := execID{cid: args.ID}
 	l := &Loader{
-		k:            k,
-		conf:         args.Conf,
-		console:      args.Console,
-		watchdog:     dog,
-		spec:         args.Spec,
-		goferFDs:     args.GoferFDs,
-		stdioFDs:     stdioFDs,
-		rootProcArgs: procArgs,
-		sandboxID:    args.ID,
-		processes:    map[execID]*execProcess{eid: {}},
-		mountHints:   mountHints,
+		k:          k,
+		watchdog:   dog,
+		sandboxID:  args.ID,
+		processes:  map[execID]*execProcess{eid: {}},
+		mountHints: mountHints,
+		root:       info,
 	}
 
 	// We don't care about child signals; some platforms can generate a
@@ -404,8 +426,8 @@ func New(args Args) (*Loader, error) {
 	return l, nil
 }
 
-// newProcess creates a process that can be run with kernel.CreateProcess.
-func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
+// createProcessArgs creates args that can be used with kernel.CreateProcess.
+func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
 	// Create initial limits.
 	ls, err := createLimitSet(spec)
 	if err != nil {
@@ -449,9 +471,19 @@ func (l *Loader) Destroy() {
 		l.stopSignalForwarding()
 	}
 	l.watchdog.Stop()
+
+	// In the success case, stdioFDs and goferFDs only contain released/closed
+	// FDs whose ownership has been passed over to host FDs and gofer sessions.
+	// Close them here in case of failure.
+	for _, fd := range l.root.stdioFDs {
+		_ = fd.Close()
+	}
+	for _, fd := range l.root.goferFDs {
+		_ = fd.Close()
+	}
 }
 
-func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) {
+func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) {
 	p, err := platform.Lookup(conf.Platform)
 	if err != nil {
 		panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err))
@@ -478,14 +510,15 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {
 	return mf, nil
 }
 
+// installSeccompFilters installs sandbox seccomp filters with the host.
 func (l *Loader) installSeccompFilters() error {
-	if l.conf.DisableSeccomp {
+	if l.root.conf.DisableSeccomp {
 		filter.Report("syscall filter is DISABLED. Running in less secure mode.")
 	} else {
 		opts := filter.Options{
 			Platform:      l.k.Platform,
-			HostNetwork:   l.conf.Network == NetworkHost,
-			ProfileEnable: l.conf.ProfileEnable,
+			HostNetwork:   l.root.conf.Network == config.NetworkHost,
+			ProfileEnable: l.root.conf.ProfileEnable,
 			ControllerFD:  l.ctrl.srv.FD(),
 		}
 		if err := filter.Install(opts); err != nil {
@@ -511,7 +544,7 @@ func (l *Loader) Run() error {
 }
 
 func (l *Loader) run() error {
-	if l.conf.Network == NetworkHost {
+	if l.root.conf.Network == config.NetworkHost {
 		// Delay host network configuration to this point because network namespace
 		// is configured after the loader is created and before Run() is called.
 		log.Debugf("Configuring host network")
@@ -532,10 +565,8 @@ func (l *Loader) run() error {
 
 	// If we are restoring, we do not want to create a process.
 	// l.restore is set by the container manager when a restore call is made.
-	var ttyFile *host.TTYFileOperations
-	var ttyFileVFS2 *hostvfs2.TTYFileDescription
 	if !l.restore {
-		if l.conf.ProfileEnable {
+		if l.root.conf.ProfileEnable {
 			pprof.Initialize()
 		}
 
@@ -545,82 +576,30 @@ func (l *Loader) run() error {
 			return err
 		}
 
-		// Create the FD map, which will set stdin, stdout, and stderr.  If console
-		// is true, then ioctl calls will be passed through to the host fd.
-		ctx := l.rootProcArgs.NewContext(l.k)
-		var err error
-
-		// CreateProcess takes a reference on FDMap if successful. We won't need
-		// ours either way.
-		l.rootProcArgs.FDTable, ttyFile, ttyFileVFS2, err = createFDTable(ctx, l.console, l.stdioFDs)
-		if err != nil {
-			return fmt.Errorf("importing fds: %v", err)
-		}
-
-		// Setup the root container file system.
-		l.startGoferMonitor(l.sandboxID, l.goferFDs)
-
-		mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
-		if err := mntr.processHints(l.conf); err != nil {
-			return err
-		}
-		if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil {
-			return err
-		}
-
-		// Add the HOME enviroment variable if it is not already set.
-		var envv []string
-		if kernel.VFS2Enabled {
-			envv, err = user.MaybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2,
-				l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
-
-		} else {
-			envv, err = user.MaybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace,
-				l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
-		}
-		if err != nil {
-			return err
-		}
-		l.rootProcArgs.Envv = envv
-
 		// Create the root container init task. It will begin running
 		// when the kernel is started.
-		if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil {
-			return fmt.Errorf("creating init process: %v", err)
+		if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil {
+			return err
 		}
 
-		// CreateProcess takes a reference on FDTable if successful.
-		l.rootProcArgs.FDTable.DecRef()
 	}
 
 	ep.tg = l.k.GlobalInit()
-	if ns, ok := specutils.GetNS(specs.PIDNamespace, l.spec); ok {
+	if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok {
 		ep.pidnsPath = ns.Path
 	}
-	if l.console {
-		// Set the foreground process group on the TTY to the global init process
-		// group, since that is what we are about to start running.
-		switch {
-		case ttyFileVFS2 != nil:
-			ep.ttyVFS2 = ttyFileVFS2
-			ttyFileVFS2.InitForegroundProcessGroup(ep.tg.ProcessGroup())
-		case ttyFile != nil:
-			ep.tty = ttyFile
-			ttyFile.InitForegroundProcessGroup(ep.tg.ProcessGroup())
-		}
-	}
 
 	// Handle signals by forwarding them to the root container process
 	// (except for panic signal, which should cause a panic).
 	l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
 		// Panic signal should cause a panic.
-		if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) {
+		if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) {
 			panic("Signal-induced panic")
 		}
 
 		// Otherwise forward to root container.
 		deliveryMode := DeliverToProcess
-		if l.console {
+		if l.root.spec.Process.Terminal {
 			// Since we are running with a console, we should forward the signal to
 			// the foreground process group so that job control signals like ^C can
 			// be handled properly.
@@ -632,19 +611,6 @@ func (l *Loader) run() error {
 		}
 	})
 
-	// l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again
-	// either in createFDTable() during initial start or in descriptor.initAfterLoad()
-	// during restore, we can release l.stdioFDs now. VFS2 takes ownership of the
-	// passed FDs, so only close for VFS1.
-	if !kernel.VFS2Enabled {
-		for _, fd := range l.stdioFDs {
-			err := syscall.Close(fd)
-			if err != nil {
-				return fmt.Errorf("close dup()ed stdioFDs: %v", err)
-			}
-		}
-	}
-
 	log.Infof("Process should have started...")
 	l.watchdog.Start()
 	return l.k.Start()
@@ -664,9 +630,9 @@ func (l *Loader) createContainer(cid string) error {
 }
 
 // startContainer starts a child container. It returns the thread group ID of
-// the newly created process. Caller owns 'files' and may close them after
-// this method returns.
-func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
+// the newly created process. Used FDs are either closed or released. It's safe
+// for the caller to close any remaining files upon return.
+func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*fd.FD) error {
 	// Create capabilities.
 	caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
 	if err != nil {
@@ -676,8 +642,8 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
 	l.mu.Lock()
 	defer l.mu.Unlock()
 
-	eid := execID{cid: cid}
-	if _, ok := l.processes[eid]; !ok {
+	ep := l.processes[execID{cid: cid}]
+	if ep == nil {
 		return fmt.Errorf("trying to start a deleted container %q", cid)
 	}
 
@@ -711,88 +677,136 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
 		if pidns == nil {
 			pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace())
 		}
-		l.processes[eid].pidnsPath = ns.Path
+		ep.pidnsPath = ns.Path
 	} else {
 		pidns = l.k.RootPIDNamespace()
 	}
-	procArgs, err := newProcess(cid, spec, creds, l.k, pidns)
+
+	info := &containerInfo{
+		conf:     conf,
+		spec:     spec,
+		stdioFDs: files[:3],
+		goferFDs: files[3:],
+	}
+	info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns)
 	if err != nil {
 		return fmt.Errorf("creating new process: %v", err)
 	}
+	tg, err := l.createContainerProcess(false, cid, info, ep)
+	if err != nil {
+		return err
+	}
+
+	// Success!
+	l.k.StartProcess(tg)
+	ep.tg = tg
+	return nil
+}
 
-	// setupContainerFS() dups stdioFDs, so we don't need to dup them here.
-	var stdioFDs []int
-	for _, f := range files[:3] {
-		stdioFDs = append(stdioFDs, int(f.Fd()))
+func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo, ep *execProcess) (*kernel.ThreadGroup, error) {
+	console := false
+	if root {
+		// Only root container supports terminal for now.
+		console = info.spec.Process.Terminal
 	}
 
 	// Create the FD map, which will set stdin, stdout, and stderr.
-	ctx := procArgs.NewContext(l.k)
-	fdTable, _, _, err := createFDTable(ctx, false, stdioFDs)
+	ctx := info.procArgs.NewContext(l.k)
+	fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, console, info.stdioFDs)
 	if err != nil {
-		return fmt.Errorf("importing fds: %v", err)
-	}
-	// CreateProcess takes a reference on fdTable if successful. We won't
-	// need ours either way.
-	procArgs.FDTable = fdTable
-
-	// Can't take ownership away from os.File. dup them to get a new FDs.
-	var goferFDs []int
-	for _, f := range files[3:] {
-		fd, err := syscall.Dup(int(f.Fd()))
-		if err != nil {
-			return fmt.Errorf("failed to dup file: %v", err)
-		}
-		goferFDs = append(goferFDs, fd)
+		return nil, fmt.Errorf("importing fds: %v", err)
 	}
+	// CreateProcess takes a reference on fdTable if successful. We won't need
+	// ours either way.
+	info.procArgs.FDTable = fdTable
 
 	// Setup the child container file system.
-	l.startGoferMonitor(cid, goferFDs)
+	l.startGoferMonitor(cid, info.goferFDs)
 
-	mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints)
-	if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil {
-		return err
+	mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints)
+	if root {
+		if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil {
+			return nil, err
+		}
+	}
+	if err := setupContainerFS(ctx, info.conf, mntr, &info.procArgs); err != nil {
+		return nil, err
 	}
 
 	// Add the HOME enviroment variable if it is not already set.
 	var envv []string
 	if kernel.VFS2Enabled {
-		envv, err = user.MaybeAddExecUserHomeVFS2(ctx, procArgs.MountNamespaceVFS2,
-			procArgs.Credentials.RealKUID, procArgs.Envv)
+		envv, err = user.MaybeAddExecUserHomeVFS2(ctx, info.procArgs.MountNamespaceVFS2,
+			info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
 
 	} else {
-		envv, err = user.MaybeAddExecUserHome(ctx, procArgs.MountNamespace,
-			procArgs.Credentials.RealKUID, procArgs.Envv)
+		envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace,
+			info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
 	}
 	if err != nil {
-		return err
+		return nil, err
 	}
-	procArgs.Envv = envv
+	info.procArgs.Envv = envv
 
 	// Create and start the new process.
-	tg, _, err := l.k.CreateProcess(procArgs)
+	tg, _, err := l.k.CreateProcess(info.procArgs)
 	if err != nil {
-		return fmt.Errorf("creating process: %v", err)
+		return nil, fmt.Errorf("creating process: %v", err)
 	}
-	l.k.StartProcess(tg)
-
 	// CreateProcess takes a reference on FDTable if successful.
-	procArgs.FDTable.DecRef()
+	info.procArgs.FDTable.DecRef(ctx)
 
-	l.processes[eid].tg = tg
-	return nil
+	// Set the foreground process group on the TTY to the global init process
+	// group, since that is what we are about to start running.
+	if root {
+		switch {
+		case ttyFileVFS2 != nil:
+			ep.ttyVFS2 = ttyFileVFS2
+			ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup())
+		case ttyFile != nil:
+			ep.tty = ttyFile
+			ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
+		}
+	}
+
+	// Install seccomp filters with the new task if there are any.
+	if info.conf.OCISeccomp {
+		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
+			program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp)
+			if err != nil {
+				return nil, fmt.Errorf("building seccomp program: %v", err)
+			}
+
+			if log.IsLogging(log.Debug) {
+				out, _ := bpf.DecodeProgram(program)
+				log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out)
+			}
+
+			task := tg.Leader()
+			// NOTE: It seems Flags are ignored by runc so we ignore them too.
+			if err := task.AppendSyscallFilter(program, true); err != nil {
+				return nil, fmt.Errorf("appending seccomp filters: %v", err)
+			}
+		}
+	} else {
+		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
+			log.Warningf("Seccomp spec is being ignored")
+		}
+	}
+
+	return tg, nil
 }
 
 // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
-// the gofer FDs looking for disconnects, and destroys the container if a
+// the gofer FDs looking for disconnects, and kills the container processes if a
 // disconnect occurs in any of the gofer FDs.
-func (l *Loader) startGoferMonitor(cid string, goferFDs []int) {
+func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) {
 	go func() {
 		log.Debugf("Monitoring gofer health for container %q", cid)
 		var events []unix.PollFd
-		for _, fd := range goferFDs {
+		for _, goferFD := range goferFDs {
 			events = append(events, unix.PollFd{
-				Fd:     int32(fd),
+				Fd:     int32(goferFD.FD()),
 				Events: unix.POLLHUP | unix.POLLRDHUP,
 			})
 		}
@@ -805,18 +819,15 @@ func (l *Loader) startGoferMonitor(cid string, goferFDs []int) {
 			panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err))
 		}
 
-		// Check if the gofer has stopped as part of normal container destruction.
-		// This is done just to avoid sending an annoying error message to the log.
-		// Note that there is a small race window in between mu.Unlock() and the
-		// lock being reacquired in destroyContainer(), but it's harmless to call
-		// destroyContainer() multiple times.
 		l.mu.Lock()
-		_, ok := l.processes[execID{cid: cid}]
-		l.mu.Unlock()
-		if ok {
-			log.Infof("Gofer socket disconnected, destroying container %q", cid)
-			if err := l.destroyContainer(cid); err != nil {
-				log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err)
+		defer l.mu.Unlock()
+
+		// The gofer could have been stopped due to a normal container shutdown.
+		// Check if the container has not stopped yet.
+		if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: cid}); tg != nil {
+			log.Infof("Gofer socket disconnected, killing container %q", cid)
+			if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
+				log.Warningf("Error killing container %q after gofer stopped: %v", cid, err)
 			}
 		}
 	}()
@@ -885,37 +896,42 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 		return 0, fmt.Errorf("container %q not started", args.ContainerID)
 	}
 
-	// Get the container MountNamespace from the Task.
+	// Get the container MountNamespace from the Task. Acquiring a ref may fail
+	// if it races with task exit.
 	if kernel.VFS2Enabled {
 		// task.MountNamespace() does not take a ref, so we must do so ourselves.
 		args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2()
-		args.MountNamespaceVFS2.IncRef()
+		if !args.MountNamespaceVFS2.TryIncRef() {
+			return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
+		}
 	} else {
+		var reffed bool
 		tg.Leader().WithMuLocked(func(t *kernel.Task) {
 			// task.MountNamespace() does not take a ref, so we must do so ourselves.
 			args.MountNamespace = t.MountNamespace()
-			args.MountNamespace.IncRef()
+			reffed = args.MountNamespace.TryIncRef()
 		})
+		if !reffed {
+			return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
+		}
 	}
 
 	// Add the HOME environment variable if it is not already set.
 	if kernel.VFS2Enabled {
-		defer args.MountNamespaceVFS2.DecRef()
-
 		root := args.MountNamespaceVFS2.Root()
-		defer root.DecRef()
 		ctx := vfs.WithRoot(l.k.SupervisorContext(), root)
+		defer args.MountNamespaceVFS2.DecRef(ctx)
+		defer root.DecRef(ctx)
 		envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv)
 		if err != nil {
 			return 0, err
 		}
 		args.Envv = envv
 	} else {
-		defer args.MountNamespace.DecRef()
-
 		root := args.MountNamespace.Root()
-		defer root.DecRef()
 		ctx := fs.WithRoot(l.k.SupervisorContext(), root)
+		defer args.MountNamespace.DecRef(ctx)
+		defer root.DecRef(ctx)
 		envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
 		if err != nil {
 			return 0, err
@@ -1012,20 +1028,25 @@ func (l *Loader) WaitExit() kernel.ExitStatus {
 	// Wait for container.
 	l.k.WaitExited()
 
+	// Cleanup
+	l.ctrl.stop()
+
+	refs.OnExit()
+
 	return l.k.GlobalInit().ExitStatus()
 }
 
-func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
+func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
 	// Create an empty network stack because the network namespace may be empty at
 	// this point. Netns is configured before Run() is called. Netstack is
 	// configured using a control uRPC message. Host network is configured inside
 	// Run().
 	switch conf.Network {
-	case NetworkHost:
+	case config.NetworkHost:
 		// No network namespacing support for hostinet yet, hence creator is nil.
 		return inet.NewRootNamespace(hostinet.NewStack(), nil), nil
 
-	case NetworkNone, NetworkSandbox:
+	case config.NetworkNone, config.NetworkSandbox:
 		s, err := newEmptySandboxNetworkStack(clock, uniqueID)
 		if err != nil {
 			return nil, err
@@ -1043,8 +1064,8 @@ func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.Uni
 }
 
 func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
-	netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
-	transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
+	netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol}
+	transProtos := []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol, icmp.NewProtocol4}
 	s := netstack.Stack{stack.New(stack.Options{
 		NetworkProtocols:   netProtos,
 		TransportProtocols: transProtos,
@@ -1058,17 +1079,30 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in
 	})}
 
 	// Enable SACK Recovery.
-	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
-		return nil, fmt.Errorf("failed to enable SACK: %s", err)
+	{
+		opt := tcpip.TCPSACKEnabled(true)
+		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+		}
 	}
 
 	// Set default TTLs as required by socket/netstack.
-	s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
-	s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+	{
+		opt := tcpip.DefaultTTLOption(netstack.DefaultTTL)
+		if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err)
+		}
+		if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err)
+		}
+	}
 
 	// Enable Receive Buffer Auto-Tuning.
-	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
-		return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err)
+	{
+		opt := tcpip.TCPModerateReceiveBufferOption(true)
+		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+		}
 	}
 
 	return &s, nil
@@ -1264,7 +1298,7 @@ func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2
 	return ep.tty, ep.ttyVFS2, nil
 }
 
-func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
 	if len(stdioFDs) != 3 {
 		return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
 	}
@@ -1273,7 +1307,7 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F
 	fdTable := k.NewFDTable()
 	ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs)
 	if err != nil {
-		fdTable.DecRef()
+		fdTable.DecRef(ctx)
 		return nil, nil, nil, err
 	}
 	return fdTable, ttyFile, ttyFileVFS2, nil
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index e448fd773..1f49431a2 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -26,6 +26,7 @@ import (
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/control/server"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
@@ -34,6 +35,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/fsgofer"
 )
 
@@ -43,15 +45,19 @@ func init() {
 	if err := fsgofer.OpenProcSelfFD(); err != nil {
 		panic(err)
 	}
+	config.RegisterFlags()
 }
 
-func testConfig() *Config {
-	return &Config{
-		RootDir:        "unused_root_dir",
-		Network:        NetworkNone,
-		DisableSeccomp: true,
-		Platform:       "ptrace",
+func testConfig() *config.Config {
+	conf, err := config.NewFromFlags()
+	if err != nil {
+		panic(err)
 	}
+	// Change test defaults.
+	conf.RootDir = "unused_root_dir"
+	conf.Network = config.NetworkNone
+	conf.DisableSeccomp = true
+	return conf
 }
 
 // testSpec returns a simple spec that can be used in tests.
@@ -258,7 +264,7 @@ type CreateMountTestcase struct {
 	expectedPaths []string
 }
 
-func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
+func createMountTestcases() []*CreateMountTestcase {
 	testCases := []*CreateMountTestcase{
 		&CreateMountTestcase{
 			// Only proc.
@@ -403,32 +409,26 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
 					Destination: "/proc",
 					Type:        "tmpfs",
 				},
-				// TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports
-				//  MkDirAt in VFS2 (and remove the reduntant append).
-				// {
-				//		Destination: "/sys/bar",
-				//		Type:        "tmpfs",
-				//	},
-				//
+				{
+					Destination: "/sys/bar",
+					Type:        "tmpfs",
+				},
+
 				{
 					Destination: "/tmp/baz",
 					Type:        "tmpfs",
 				},
 			},
 		},
-		expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"},
+		expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz"},
 	}
 
-	if !vfs2 {
-		vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"})
-		vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar")
-	}
 	return append(testCases, vfsCase)
 }
 
 // Test that MountNamespace can be created with various specs.
 func TestCreateMountNamespace(t *testing.T) {
-	for _, tc := range createMountTestcases(false /* vfs2 */) {
+	for _, tc := range createMountTestcases() {
 		t.Run(tc.name, func(t *testing.T) {
 			conf := testConfig()
 			ctx := contexttest.Context(t)
@@ -439,7 +439,7 @@ func TestCreateMountNamespace(t *testing.T) {
 			}
 			defer cleanup()
 
-			mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{})
+			mntr := newContainerMounter(&tc.spec, []*fd.FD{fd.New(sandEnd)}, nil, &podMountHints{})
 			mns, err := mntr.createMountNamespace(ctx, conf)
 			if err != nil {
 				t.Fatalf("failed to create mount namespace: %v", err)
@@ -450,13 +450,13 @@ func TestCreateMountNamespace(t *testing.T) {
 			}
 
 			root := mns.Root()
-			defer root.DecRef()
+			defer root.DecRef(ctx)
 			for _, p := range tc.expectedPaths {
 				maxTraversals := uint(0)
 				if d, err := mns.FindInode(ctx, root, root, p, &maxTraversals); err != nil {
 					t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
 				} else {
-					d.DecRef()
+					d.DecRef(ctx)
 				}
 			}
 		})
@@ -465,7 +465,7 @@ func TestCreateMountNamespace(t *testing.T) {
 
 // Test that MountNamespace can be created with various specs.
 func TestCreateMountNamespaceVFS2(t *testing.T) {
-	for _, tc := range createMountTestcases(true /* vfs2 */) {
+	for _, tc := range createMountTestcases() {
 		t.Run(tc.name, func(t *testing.T) {
 			spec := testSpec()
 			spec.Mounts = tc.spec.Mounts
@@ -479,19 +479,19 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
 			defer l.Destroy()
 			defer loaderCleanup()
 
-			mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
-			if err := mntr.processHints(l.conf); err != nil {
+			mntr := newContainerMounter(l.root.spec, l.root.goferFDs, l.k, l.mountHints)
+			if err := mntr.processHints(l.root.conf, l.root.procArgs.Credentials); err != nil {
 				t.Fatalf("failed process hints: %v", err)
 			}
 
 			ctx := l.k.SupervisorContext()
-			mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs)
+			mns, err := mntr.mountAll(l.root.conf, &l.root.procArgs)
 			if err != nil {
-				t.Fatalf("failed to setupVFS2: %v", err)
+				t.Fatalf("mountAll: %v", err)
 			}
 
 			root := mns.Root()
-			defer root.DecRef()
+			defer root.DecRef(ctx)
 			for _, p := range tc.expectedPaths {
 				target := &vfs.PathOperation{
 					Root:  root,
@@ -499,10 +499,10 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
 					Path:  fspath.Parse(p),
 				}
 
-				if d, err := l.k.VFS().GetDentryAt(ctx, l.rootProcArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil {
+				if d, err := l.k.VFS().GetDentryAt(ctx, l.root.procArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil {
 					t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
 				} else {
-					d.DecRef()
+					d.DecRef(ctx)
 				}
 			}
 		})
@@ -545,7 +545,7 @@ func TestRestoreEnvironment(t *testing.T) {
 						{
 							Dev:        "9pfs-/",
 							Flags:      fs.MountSourceFlags{ReadOnly: true},
-							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
 						},
 					},
 					"tmpfs": {
@@ -599,7 +599,7 @@ func TestRestoreEnvironment(t *testing.T) {
 						{
 							Dev:        "9pfs-/",
 							Flags:      fs.MountSourceFlags{ReadOnly: true},
-							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
 						},
 						{
 							Dev:        "9pfs-/dev/fd-foo",
@@ -657,7 +657,7 @@ func TestRestoreEnvironment(t *testing.T) {
 						{
 							Dev:        "9pfs-/",
 							Flags:      fs.MountSourceFlags{ReadOnly: true},
-							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
 						},
 					},
 					"tmpfs": {
@@ -697,7 +697,11 @@ func TestRestoreEnvironment(t *testing.T) {
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
 			conf := testConfig()
-			mntr := newContainerMounter(tc.spec, tc.ioFDs, nil, &podMountHints{})
+			var ioFDs []*fd.FD
+			for _, ioFD := range tc.ioFDs {
+				ioFDs = append(ioFDs, fd.New(ioFD))
+			}
+			mntr := newContainerMounter(tc.spec, ioFDs, nil, &podMountHints{})
 			actualRenv, err := mntr.createRestoreEnvironment(conf)
 			if !tc.errorExpected && err != nil {
 				t.Fatalf("could not create restore environment for test:%s", tc.name)
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 14d2f56a5..988573640 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
 	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.dev/gvisor/pkg/tcpip/link/packetsocket"
 	"gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo"
 	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
 	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
@@ -32,6 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 var (
@@ -77,44 +79,6 @@ type DefaultRoute struct {
 	Name  string
 }
 
-// QueueingDiscipline is used to specify the kind of Queueing Discipline to
-// apply for a give FDBasedLink.
-type QueueingDiscipline int
-
-const (
-	// QDiscNone disables any queueing for the underlying FD.
-	QDiscNone QueueingDiscipline = iota
-
-	// QDiscFIFO applies a simple fifo based queue to the underlying
-	// FD.
-	QDiscFIFO
-)
-
-// MakeQueueingDiscipline if possible the equivalent QueuingDiscipline for s
-// else returns an error.
-func MakeQueueingDiscipline(s string) (QueueingDiscipline, error) {
-	switch s {
-	case "none":
-		return QDiscNone, nil
-	case "fifo":
-		return QDiscFIFO, nil
-	default:
-		return 0, fmt.Errorf("unsupported qdisc specified: %q", s)
-	}
-}
-
-// String implements fmt.Stringer.
-func (q QueueingDiscipline) String() string {
-	switch q {
-	case QDiscNone:
-		return "none"
-	case QDiscFIFO:
-		return "fifo"
-	default:
-		panic(fmt.Sprintf("Invalid queueing discipline: %d", q))
-	}
-}
-
 // FDBasedLink configures an fd-based link.
 type FDBasedLink struct {
 	Name               string
@@ -126,7 +90,7 @@ type FDBasedLink struct {
 	TXChecksumOffload  bool
 	RXChecksumOffload  bool
 	LinkAddress        net.HardwareAddr
-	QDisc              QueueingDiscipline
+	QDisc              config.QueueingDiscipline
 
 	// NumChannels controls how many underlying FD's are to be used to
 	// create this endpoint.
@@ -246,12 +210,15 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		}
 
 		switch link.QDisc {
-		case QDiscNone:
-		case QDiscFIFO:
+		case config.QDiscNone:
+		case config.QDiscFIFO:
 			log.Infof("Enabling FIFO QDisc on %q", link.Name)
 			linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000)
 		}
 
+		// Enable support for AF_PACKET sockets to receive outgoing packets.
+		linkEP = packetsocket.New(linkEP)
+
 		log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
 		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
 			return err
diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go
index fbfd3b07c..c21648a32 100644
--- a/runsc/boot/strace.go
+++ b/runsc/boot/strace.go
@@ -15,10 +15,13 @@
 package boot
 
 import (
+	"strings"
+
 	"gvisor.dev/gvisor/pkg/sentry/strace"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
-func enableStrace(conf *Config) error {
+func enableStrace(conf *config.Config) error {
 	// We must initialize even if strace is not enabled.
 	strace.Initialize()
 
@@ -36,5 +39,5 @@ func enableStrace(conf *Config) error {
 		strace.EnableAll(strace.SinkTypeLog)
 		return nil
 	}
-	return strace.Enable(conf.StraceSyscalls, strace.SinkTypeLog)
+	return strace.Enable(strings.Split(conf.StraceSyscalls, ","), strace.SinkTypeLog)
 }
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index b68117867..e36664938 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -16,12 +16,12 @@ package boot
 
 import (
 	"fmt"
-	"path"
 	"sort"
 	"strings"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/cleanup"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
@@ -37,13 +37,19 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
-func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error {
+func registerFilesystems(k *kernel.Kernel) error {
+	ctx := k.SupervisorContext()
+	creds := auth.NewRootCredentials(k.RootUserNamespace())
+	vfsObj := k.VFS()
+
 	vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserList: true,
 		// TODO(b/29356795): Users may mount this once the terminals are in a
@@ -73,6 +79,10 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre
 		AllowUserMount: true,
 		AllowUserList:  true,
 	})
+	vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
 
 	// Setup files in devtmpfs.
 	if err := memdev.Register(vfsObj); err != nil {
@@ -81,18 +91,24 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre
 	if err := ttydev.Register(vfsObj); err != nil {
 		return fmt.Errorf("registering ttydev: %w", err)
 	}
-
-	if err := fuse.Register(vfsObj); err != nil {
-		return fmt.Errorf("registering fusedev: %w", err)
+	tunSupported := tundev.IsNetTunSupported(inet.StackFromContext(ctx))
+	if tunSupported {
+		if err := tundev.Register(vfsObj); err != nil {
+			return fmt.Errorf("registering tundev: %v", err)
+		}
 	}
-	if err := tundev.Register(vfsObj); err != nil {
-		return fmt.Errorf("registering tundev: %v", err)
+
+	if kernel.FUSEEnabled {
+		if err := fuse.Register(vfsObj); err != nil {
+			return fmt.Errorf("registering fusedev: %w", err)
+		}
 	}
+
 	a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name)
 	if err != nil {
 		return fmt.Errorf("creating devtmpfs accessor: %w", err)
 	}
-	defer a.Release()
+	defer a.Release(ctx)
 
 	if err := a.UserspaceInit(ctx); err != nil {
 		return fmt.Errorf("initializing userspace: %w", err)
@@ -103,20 +119,23 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre
 	if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil {
 		return fmt.Errorf("creating ttydev devtmpfs files: %w", err)
 	}
-	if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil {
-		return fmt.Errorf("creating tundev devtmpfs files: %v", err)
+	if tunSupported {
+		if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil {
+			return fmt.Errorf("creating tundev devtmpfs files: %v", err)
+		}
 	}
-	if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil {
-		return fmt.Errorf("creating fusedev devtmpfs files: %w", err)
+
+	if kernel.FUSEEnabled {
+		if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil {
+			return fmt.Errorf("creating fusedev devtmpfs files: %w", err)
+		}
 	}
+
 	return nil
 }
 
-func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
-	if err := mntr.k.VFS().Init(); err != nil {
-		return fmt.Errorf("failed to initialize VFS: %w", err)
-	}
-	mns, err := mntr.setupVFS2(ctx, conf, procArgs)
+func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+	mns, err := mntr.mountAll(conf, procArgs)
 	if err != nil {
 		return fmt.Errorf("failed to setupFS: %w", err)
 	}
@@ -131,7 +150,7 @@ func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounte
 	return nil
 }
 
-func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
+func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
 	log.Infof("Configuring container's file system with VFS2")
 
 	// Create context with root credentials to mount the filesystem (the current
@@ -144,36 +163,115 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs
 	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
 	rootCtx := procArgs.NewContext(c.k)
 
-	if err := registerFilesystems(rootCtx, c.k.VFS(), rootCreds); err != nil {
-		return nil, fmt.Errorf("register filesystems: %w", err)
-	}
-
 	mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds)
 	if err != nil {
 		return nil, fmt.Errorf("creating mount namespace: %w", err)
 	}
 	rootProcArgs.MountNamespaceVFS2 = mns
 
+	root := mns.Root()
+	defer root.DecRef(rootCtx)
+	if root.Mount().ReadOnly() {
+		// Switch to ReadWrite while we setup submounts.
+		if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil {
+			return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err)
+		}
+		// Restore back to ReadOnly at the end.
+		defer func() {
+			if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil {
+				panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err))
+			}
+		}()
+	}
+
 	// Mount submounts.
 	if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil {
 		return nil, fmt.Errorf("mounting submounts vfs2: %w", err)
 	}
+
 	return mns, nil
 }
 
-func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
+// createMountNamespaceVFS2 creates the container's root mount and namespace.
+func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
 	fd := c.fds.remove()
-	opts := strings.Join(p9MountData(fd, conf.FileAccess, true /* vfs2 */), ",")
+	data := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
+
+	if conf.OverlayfsStaleRead {
+		// We can't check for overlayfs here because sandbox is chroot'ed and gofer
+		// can only send mount options for specs.Mounts (specs.Root is missing
+		// Options field). So assume root is always on top of overlayfs.
+		data = append(data, "overlayfs_stale_read")
+	}
 
 	log.Infof("Mounting root over 9P, ioFD: %d", fd)
-	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{Data: opts})
+	opts := &vfs.MountOptions{
+		ReadOnly: c.root.Readonly,
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			Data: strings.Join(data, ","),
+		},
+		InternalMount: true,
+	}
+
+	fsName := gofer.Name
+	if conf.Overlay && !c.root.Readonly {
+		log.Infof("Adding overlay on top of root")
+		var err error
+		var cleanup func()
+		opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+		if err != nil {
+			return nil, fmt.Errorf("mounting root with overlay: %w", err)
+		}
+		defer cleanup()
+		fsName = overlay.Name
+	}
+
+	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", fsName, opts)
 	if err != nil {
 		return nil, fmt.Errorf("setting up mount namespace: %w", err)
 	}
 	return mns, nil
 }
 
-func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
+// configureOverlay mounts the lower layer using "lowerOpts", mounts the upper
+// layer using tmpfs, and returns overlay mount options. "cleanup" must be called
+// after the options have been used to mount the overlay, to release refs on
+// lower and upper mounts.
+func (c *containerMounter) configureOverlay(ctx context.Context, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string) (*vfs.MountOptions, func(), error) {
+	// First copy options from lower layer to upper layer and overlay. Clear
+	// filesystem specific options.
+	upperOpts := *lowerOpts
+	upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
+
+	overlayOpts := *lowerOpts
+	overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
+
+	// Next mount upper and lower. Upper is a tmpfs mount to keep all
+	// modifications inside the sandbox.
+	upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err)
+	}
+	cu := cleanup.Make(func() { upper.DecRef(ctx) })
+	defer cu.Clean()
+
+	// All writes go to the upper layer, be paranoid and make lower readonly.
+	lowerOpts.ReadOnly = true
+	lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts)
+	if err != nil {
+		return nil, nil, err
+	}
+	cu.Add(func() { lower.DecRef(ctx) })
+
+	// Configure overlay with both layers.
+	overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{
+		UpperRoot:  vfs.MakeVirtualDentry(upper, upper.Root()),
+		LowerRoots: []vfs.VirtualDentry{vfs.MakeVirtualDentry(lower, lower.Root())},
+	}
+	return &overlayOpts, cu.Release(), nil
+}
+
+func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
 	mounts, err := c.prepareMountsVFS2()
 	if err != nil {
 		return err
@@ -182,8 +280,34 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config,
 	for i := range mounts {
 		submount := &mounts[i]
 		log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options)
-		if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil {
-			return err
+		var (
+			mnt *vfs.Mount
+			err error
+		)
+
+		if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() {
+			mnt, err = c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint)
+			if err != nil {
+				return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err)
+			}
+		} else {
+			mnt, err = c.mountSubmountVFS2(ctx, conf, mns, creds, submount)
+			if err != nil {
+				return fmt.Errorf("mount submount %q: %w", submount.Destination, err)
+			}
+		}
+
+		if mnt != nil && mnt.ReadOnly() {
+			// Switch to ReadWrite while we setup submounts.
+			if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
+				return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.Destination, err)
+			}
+			// Restore back to ReadOnly at the end.
+			defer func() {
+				if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
+					panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.Destination, err))
+				}
+			}()
 		}
 	}
 
@@ -227,62 +351,83 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
 	return mounts, nil
 }
 
-func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error {
-	root := mns.Root()
-	defer root.DecRef()
-	target := &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(submount.Destination),
-	}
-	fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount)
+func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) (*vfs.Mount, error) {
+	fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, submount)
 	if err != nil {
-		return fmt.Errorf("mountOptions failed: %w", err)
+		return nil, fmt.Errorf("mountOptions failed: %w", err)
 	}
 	if len(fsName) == 0 {
 		// Filesystem is not supported (e.g. cgroup), just skip it.
-		return nil
+		return nil, nil
 	}
 
-	if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil {
-		return err
+	if err := c.makeMountPoint(ctx, creds, mns, submount.Destination); err != nil {
+		return nil, fmt.Errorf("creating mount point %q: %w", submount.Destination, err)
 	}
-	if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil {
-		return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
+
+	if useOverlay {
+		log.Infof("Adding overlay on top of mount %q", submount.Destination)
+		var cleanup func()
+		opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+		if err != nil {
+			return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.Destination, err)
+		}
+		defer cleanup()
+		fsName = overlay.Name
+	}
+
+	root := mns.Root()
+	defer root.DecRef(ctx)
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(submount.Destination),
+	}
+	mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts)
+	if err != nil {
+		return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
 	}
 	log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts.GetFilesystemOptions.Data)
-	return nil
+	return mnt, nil
 }
 
 // getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
 // used for mounts.
-func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) {
-	var (
-		fsName string
-		data   []string
-	)
+func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, bool, error) {
+	fsName := m.Type
+	useOverlay := false
+	var data []string
 
 	// Find filesystem name and FS specific data field.
 	switch m.Type {
 	case devpts.Name, devtmpfs.Name, proc.Name, sys.Name:
-		fsName = m.Type
+		// Nothing to do.
+
 	case nonefs:
 		fsName = sys.Name
-	case tmpfs.Name:
-		fsName = m.Type
 
+	case tmpfs.Name:
 		var err error
 		data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
 		if err != nil {
-			return "", nil, err
+			return "", nil, false, err
 		}
 
 	case bind:
 		fsName = gofer.Name
+		if m.fd == 0 {
+			// Check that an FD was provided to fail fast. Technically FD=0 is valid,
+			// but unlikely to be correct in this context.
+			return "", nil, false, fmt.Errorf("9P mount requires a connection FD")
+		}
 		data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)
 
+		// If configured, add overlay to all writable mounts.
+		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
+
 	default:
 		log.Warningf("ignoring unknown filesystem type %q", m.Type)
+		return "", nil, false, nil
 	}
 
 	opts := &vfs.MountOptions{
@@ -307,38 +452,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF
 		}
 	}
 
-	if conf.Overlay {
-		// All writes go to upper, be paranoid and make lower readonly.
-		opts.ReadOnly = true
-	}
-	return fsName, opts, nil
-}
-
-func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error {
-	target := &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(currentPath),
-	}
-	_, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{})
-	if err == nil {
-		// Mount point exists, nothing else to do.
-		return nil
-	}
-	if err != syserror.ENOENT {
-		return fmt.Errorf("stat failed for %q during mount point creation: %w", currentPath, err)
-	}
-
-	// Recurse to ensure parent is created and then create the mount point.
-	if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil {
-		return err
-	}
-	log.Debugf("Creating dir %q for mount point", currentPath)
-	mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
-	if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil {
-		return fmt.Errorf("failed to create directory %q for mount: %w", currentPath, err)
-	}
-	return nil
+	return fsName, opts, useOverlay, nil
 }
 
 // mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so.
@@ -350,7 +464,7 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s
 //
 // Note that when there are submounts inside of '/tmp', directories for the
 // mount points must be present, making '/tmp' not empty anymore.
-func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
+func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
 	for _, m := range c.mounts {
 		// m.Destination has been cleaned, so it's to use equality here.
 		if m.Destination == "/tmp" {
@@ -360,28 +474,35 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds
 	}
 
 	root := mns.Root()
-	defer root.DecRef()
+	defer root.DecRef(ctx)
 	pop := vfs.PathOperation{
 		Root:  root,
 		Start: root,
 		Path:  fspath.Parse("/tmp"),
 	}
 	// TODO(gvisor.dev/issue/2782): Use O_PATH when available.
-	statx, err := c.k.VFS().StatAt(ctx, creds, &pop, &vfs.StatOptions{})
+	fd, err := c.k.VFS().OpenAt(ctx, creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY})
 	switch err {
 	case nil:
-		// Found '/tmp' in filesystem, check if it's empty.
-		if linux.FileMode(statx.Mode).FileType() != linux.ModeDirectory {
-			// Not a dir?! Leave it be.
+		defer fd.DecRef(ctx)
+
+		err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
+			if dirent.Name != "." && dirent.Name != ".." {
+				return syserror.ENOTEMPTY
+			}
 			return nil
-		}
-		if statx.Nlink > 2 {
+		}))
+		switch err {
+		case nil:
+			log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
+		case syserror.ENOTEMPTY:
 			// If more than "." and ".." is found, skip internal tmpfs to prevent
 			// hiding existing files.
 			log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`)
 			return nil
+		default:
+			return err
 		}
-		log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
 		fallthrough
 
 	case syserror.ENOENT:
@@ -394,9 +515,122 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds
 			// another user. This is normally done for /tmp.
 			Options: []string{"mode=01777"},
 		}
-		return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
+		_, err := c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
+		return err
+
+	case syserror.ENOTDIR:
+		// Not a dir?! Let it be.
+		return nil
 
 	default:
-		return fmt.Errorf(`stating "/tmp" inside container: %w`, err)
+		return fmt.Errorf(`opening "/tmp" inside container: %w`, err)
+	}
+}
+
+// processHintsVFS2 processes annotations that contain hints about how volumes
+// should be mounted (e.g. a volume shared between containers). It must be
+// called for the root container only.
+func (c *containerMounter) processHintsVFS2(conf *config.Config, creds *auth.Credentials) error {
+	ctx := c.k.SupervisorContext()
+	for _, hint := range c.hints.mounts {
+		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
+		// common gofer to mount all shared volumes.
+		if hint.mount.Type != tmpfs.Name {
+			continue
+		}
+
+		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
+		mnt, err := c.mountSharedMasterVFS2(ctx, conf, hint, creds)
+		if err != nil {
+			return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
+		}
+		hint.vfsMount = mnt
+	}
+	return nil
+}
+
+// mountSharedMasterVFS2 mounts the master of a volume that is shared among
+// containers in a pod.
+func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *config.Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) {
+	// Map mount type to filesystem name, and parse out the options that we are
+	// capable of dealing with.
+	mntFD := &mountAndFD{Mount: hint.mount}
+	fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, mntFD)
+	if err != nil {
+		return nil, err
+	}
+	if len(fsName) == 0 {
+		return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
+	}
+
+	if useOverlay {
+		log.Infof("Adding overlay on top of shared mount %q", mntFD.Destination)
+		var cleanup func()
+		opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+		if err != nil {
+			return nil, fmt.Errorf("mounting shared volume with overlay at %q: %w", mntFD.Destination, err)
+		}
+		defer cleanup()
+		fsName = overlay.Name
+	}
+
+	return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
+}
+
+// mountSharedSubmountVFS2 binds mount to a previously mounted volume that is shared
+// among containers in the same pod.
+func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) (*vfs.Mount, error) {
+	if err := source.checkCompatible(mount); err != nil {
+		return nil, err
+	}
+
+	// Ignore data and useOverlay because these were already applied to
+	// the master mount.
+	_, opts, _, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount})
+	if err != nil {
+		return nil, err
+	}
+	newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts)
+	if err != nil {
+		return nil, err
+	}
+	defer newMnt.DecRef(ctx)
+
+	root := mns.Root()
+	defer root.DecRef(ctx)
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(mount.Destination),
+	}
+
+	if err := c.makeMountPoint(ctx, creds, mns, mount.Destination); err != nil {
+		return nil, fmt.Errorf("creating mount point %q: %w", mount.Destination, err)
+	}
+
+	if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
+		return nil, err
+	}
+	log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
+	return newMnt, nil
+}
+
+func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error {
+	root := mns.Root()
+	defer root.DecRef(ctx)
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(dest),
+	}
+	// First check if mount point exists. When overlay is enabled, gofer doesn't
+	// allow changes to the FS, making MakeSyntheticMountpoint() ineffective
+	// because MkdirAt fails with EROFS even if file exists.
+	vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{})
+	if err == nil {
+		// File exists, we're done.
+		vd.DecRef(ctx)
+		return nil
 	}
+	return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds)
 }