15 files changed, 838 insertions, 905 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 9f52438c2..b97dc3c47 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -8,7 +8,6 @@ go_library(
         "compat.go",
         "compat_amd64.go",
         "compat_arm64.go",
-        "config.go",
         "controller.go",
         "debug.go",
         "events.go",
@@ -27,15 +26,19 @@ go_library(
     deps = [
         "//pkg/abi",
         "//pkg/abi/linux",
+        "//pkg/bpf",
+        "//pkg/cleanup",
         "//pkg/context",
         "//pkg/control/server",
         "//pkg/cpuid",
         "//pkg/eventchannel",
+        "//pkg/fd",
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/memutil",
         "//pkg/rand",
         "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/sentry/arch",
         "//pkg/sentry/arch:registers_go_proto",
         "//pkg/sentry/control",
@@ -105,9 +108,11 @@ go_library(
         "//runsc/boot/filter",
         "//runsc/boot/platforms",
         "//runsc/boot/pprof",
+        "//runsc/config",
         "//runsc/specutils",
-        "@com_github_golang_protobuf//proto:go_default_library",
+        "//runsc/specutils/seccomp",
         "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
+        "@org_golang_google_protobuf//proto:go_default_library",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
@@ -123,6 +128,7 @@ go_test(
     library = ":boot",
     deps = [
         "//pkg/control/server",
+        "//pkg/fd",
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/p9",
@@ -131,6 +137,7 @@ go_test(
         "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/unet",
+        "//runsc/config",
         "//runsc/fsgofer",
         "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
         "@org_golang_x_sys//unix:go_default_library",
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
index 84c67cbc2..7076ae2e2 100644
--- a/runsc/boot/compat.go
+++ b/runsc/boot/compat.go
@@ -19,7 +19,7 @@ import (
 	"os"
 	"syscall"
 
-	"github.com/golang/protobuf/proto"
+	"google.golang.org/protobuf/proto"
 	"gvisor.dev/gvisor/pkg/eventchannel"
 	"gvisor.dev/gvisor/pkg/log"
 	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
deleted file mode 100644
index 80da8b3e6..000000000
--- a/runsc/boot/config.go
+++ /dev/null
@@ -1,336 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package boot
-
-import (
-	"fmt"
-	"strconv"
-	"strings"
-
-	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/watchdog"
-)
-
-// FileAccessType tells how the filesystem is accessed.
-type FileAccessType int
-
-const (
-	// FileAccessShared sends IO requests to a Gofer process that validates the
-	// requests and forwards them to the host.
-	FileAccessShared FileAccessType = iota
-
-	// FileAccessExclusive is the same as FileAccessShared, but enables
-	// extra caching for improved performance. It should only be used if
-	// the sandbox has exclusive access to the filesystem.
-	FileAccessExclusive
-)
-
-// MakeFileAccessType converts type from string.
-func MakeFileAccessType(s string) (FileAccessType, error) {
-	switch s {
-	case "shared":
-		return FileAccessShared, nil
-	case "exclusive":
-		return FileAccessExclusive, nil
-	default:
-		return 0, fmt.Errorf("invalid file access type %q", s)
-	}
-}
-
-func (f FileAccessType) String() string {
-	switch f {
-	case FileAccessShared:
-		return "shared"
-	case FileAccessExclusive:
-		return "exclusive"
-	default:
-		return fmt.Sprintf("unknown(%d)", f)
-	}
-}
-
-// NetworkType tells which network stack to use.
-type NetworkType int
-
-const (
-	// NetworkSandbox uses internal network stack, isolated from the host.
-	NetworkSandbox NetworkType = iota
-
-	// NetworkHost redirects network related syscalls to the host network.
-	NetworkHost
-
-	// NetworkNone sets up just loopback using netstack.
-	NetworkNone
-)
-
-// MakeNetworkType converts type from string.
-func MakeNetworkType(s string) (NetworkType, error) {
-	switch s {
-	case "sandbox":
-		return NetworkSandbox, nil
-	case "host":
-		return NetworkHost, nil
-	case "none":
-		return NetworkNone, nil
-	default:
-		return 0, fmt.Errorf("invalid network type %q", s)
-	}
-}
-
-func (n NetworkType) String() string {
-	switch n {
-	case NetworkSandbox:
-		return "sandbox"
-	case NetworkHost:
-		return "host"
-	case NetworkNone:
-		return "none"
-	default:
-		return fmt.Sprintf("unknown(%d)", n)
-	}
-}
-
-// MakeWatchdogAction converts type from string.
-func MakeWatchdogAction(s string) (watchdog.Action, error) {
-	switch strings.ToLower(s) {
-	case "log", "logwarning":
-		return watchdog.LogWarning, nil
-	case "panic":
-		return watchdog.Panic, nil
-	default:
-		return 0, fmt.Errorf("invalid watchdog action %q", s)
-	}
-}
-
-// MakeRefsLeakMode converts type from string.
-func MakeRefsLeakMode(s string) (refs.LeakMode, error) {
-	switch strings.ToLower(s) {
-	case "disabled":
-		return refs.NoLeakChecking, nil
-	case "log-names":
-		return refs.LeaksLogWarning, nil
-	case "log-traces":
-		return refs.LeaksLogTraces, nil
-	default:
-		return 0, fmt.Errorf("invalid refs leakmode %q", s)
-	}
-}
-
-func refsLeakModeToString(mode refs.LeakMode) string {
-	switch mode {
-	// If not set, default it to disabled.
-	case refs.UninitializedLeakChecking, refs.NoLeakChecking:
-		return "disabled"
-	case refs.LeaksLogWarning:
-		return "log-names"
-	case refs.LeaksLogTraces:
-		return "log-traces"
-	default:
-		panic(fmt.Sprintf("Invalid leakmode: %d", mode))
-	}
-}
-
-// Config holds configuration that is not part of the runtime spec.
-type Config struct {
-	// RootDir is the runtime root directory.
-	RootDir string
-
-	// Debug indicates that debug logging should be enabled.
-	Debug bool
-
-	// LogFilename is the filename to log to, if not empty.
-	LogFilename string
-
-	// LogFormat is the log format.
-	LogFormat string
-
-	// DebugLog is the path to log debug information to, if not empty.
-	DebugLog string
-
-	// PanicLog is the path to log GO's runtime messages, if not empty.
-	PanicLog string
-
-	// DebugLogFormat is the log format for debug.
-	DebugLogFormat string
-
-	// FileAccess indicates how the filesystem is accessed.
-	FileAccess FileAccessType
-
-	// Overlay is whether to wrap the root filesystem in an overlay.
-	Overlay bool
-
-	// FSGoferHostUDS enables the gofer to mount a host UDS.
-	FSGoferHostUDS bool
-
-	// Network indicates what type of network to use.
-	Network NetworkType
-
-	// EnableRaw indicates whether raw sockets should be enabled. Raw
-	// sockets are disabled by stripping CAP_NET_RAW from the list of
-	// capabilities.
-	EnableRaw bool
-
-	// HardwareGSO indicates that hardware segmentation offload is enabled.
-	HardwareGSO bool
-
-	// SoftwareGSO indicates that software segmentation offload is enabled.
-	SoftwareGSO bool
-
-	// TXChecksumOffload indicates that TX Checksum Offload is enabled.
-	TXChecksumOffload bool
-
-	// RXChecksumOffload indicates that RX Checksum Offload is enabled.
-	RXChecksumOffload bool
-
-	// QDisc indicates the type of queuening discipline to use by default
-	// for non-loopback interfaces.
-	QDisc QueueingDiscipline
-
-	// LogPackets indicates that all network packets should be logged.
-	LogPackets bool
-
-	// Platform is the platform to run on.
-	Platform string
-
-	// Strace indicates that strace should be enabled.
-	Strace bool
-
-	// StraceSyscalls is the set of syscalls to trace.  If StraceEnable is
-	// true and this list is empty, then all syscalls will be traced.
-	StraceSyscalls []string
-
-	// StraceLogSize is the max size of data blobs to display.
-	StraceLogSize uint
-
-	// DisableSeccomp indicates whether seccomp syscall filters should be
-	// disabled. Pardon the double negation, but default to enabled is important.
-	DisableSeccomp bool
-
-	// WatchdogAction sets what action the watchdog takes when triggered.
-	WatchdogAction watchdog.Action
-
-	// PanicSignal registers signal handling that panics. Usually set to
-	// SIGUSR2(12) to troubleshoot hangs. -1 disables it.
-	PanicSignal int
-
-	// ProfileEnable is set to prepare the sandbox to be profiled.
-	ProfileEnable bool
-
-	// RestoreFile is the path to the saved container image
-	RestoreFile string
-
-	// NumNetworkChannels controls the number of AF_PACKET sockets that map
-	// to the same underlying network device. This allows netstack to better
-	// scale for high throughput use cases.
-	NumNetworkChannels int
-
-	// Rootless allows the sandbox to be started with a user that is not root.
-	// Defense is depth measures are weaker with rootless. Specifically, the
-	// sandbox and Gofer process run as root inside a user namespace with root
-	// mapped to the caller's user.
-	Rootless bool
-
-	// AlsoLogToStderr allows to send log messages to stderr.
-	AlsoLogToStderr bool
-
-	// ReferenceLeakMode sets reference leak check mode
-	ReferenceLeakMode refs.LeakMode
-
-	// OverlayfsStaleRead instructs the sandbox to assume that the root mount
-	// is on a Linux overlayfs mount, which does not necessarily preserve
-	// coherence between read-only and subsequent writable file descriptors
-	// representing the "same" file.
-	OverlayfsStaleRead bool
-
-	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
-	// tests. It allows runsc to start the sandbox process as the current
-	// user, and without chrooting the sandbox process. This can be
-	// necessary in test environments that have limited capabilities.
-	TestOnlyAllowRunAsCurrentUserWithoutChroot bool
-
-	// TestOnlyTestNameEnv should only be used in tests. It looks up for the
-	// test name in the container environment variables and adds it to the debug
-	// log file name. This is done to help identify the log with the test when
-	// multiple tests are run in parallel, since there is no way to pass
-	// parameters to the runtime from docker.
-	TestOnlyTestNameEnv string
-
-	// CPUNumFromQuota sets CPU number count to available CPU quota, using
-	// least integer value greater than or equal to quota.
-	//
-	// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
-	CPUNumFromQuota bool
-
-	// Enables VFS2 (not plumbled through yet).
-	VFS2 bool
-
-	// Enables FUSE usage (not plumbled through yet).
-	FUSE bool
-}
-
-// ToFlags returns a slice of flags that correspond to the given Config.
-func (c *Config) ToFlags() []string {
-	f := []string{
-		"--root=" + c.RootDir,
-		"--debug=" + strconv.FormatBool(c.Debug),
-		"--log=" + c.LogFilename,
-		"--log-format=" + c.LogFormat,
-		"--debug-log=" + c.DebugLog,
-		"--panic-log=" + c.PanicLog,
-		"--debug-log-format=" + c.DebugLogFormat,
-		"--file-access=" + c.FileAccess.String(),
-		"--overlay=" + strconv.FormatBool(c.Overlay),
-		"--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS),
-		"--network=" + c.Network.String(),
-		"--log-packets=" + strconv.FormatBool(c.LogPackets),
-		"--platform=" + c.Platform,
-		"--strace=" + strconv.FormatBool(c.Strace),
-		"--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","),
-		"--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)),
-		"--watchdog-action=" + c.WatchdogAction.String(),
-		"--panic-signal=" + strconv.Itoa(c.PanicSignal),
-		"--profile=" + strconv.FormatBool(c.ProfileEnable),
-		"--net-raw=" + strconv.FormatBool(c.EnableRaw),
-		"--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
-		"--rootless=" + strconv.FormatBool(c.Rootless),
-		"--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr),
-		"--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode),
-		"--gso=" + strconv.FormatBool(c.HardwareGSO),
-		"--software-gso=" + strconv.FormatBool(c.SoftwareGSO),
-		"--rx-checksum-offload=" + strconv.FormatBool(c.RXChecksumOffload),
-		"--tx-checksum-offload=" + strconv.FormatBool(c.TXChecksumOffload),
-		"--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
-		"--qdisc=" + c.QDisc.String(),
-	}
-	if c.CPUNumFromQuota {
-		f = append(f, "--cpu-num-from-quota")
-	}
-	// Only include these if set since it is never to be used by users.
-	if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
-		f = append(f, "--TESTONLY-unsafe-nonroot=true")
-	}
-	if len(c.TestOnlyTestNameEnv) != 0 {
-		f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv)
-	}
-
-	if c.VFS2 {
-		f = append(f, "--vfs2=true")
-	}
-
-	if c.FUSE {
-		f = append(f, "--fuse=true")
-	}
-
-	return f
-}
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 626a3816e..4e0f0d57a 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -22,6 +22,7 @@ import (
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/control/server"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -29,10 +30,12 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	"gvisor.dev/gvisor/pkg/sentry/state"
 	"gvisor.dev/gvisor/pkg/sentry/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/boot/pprof"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -220,7 +223,7 @@ type StartArgs struct {
 	Spec *specs.Spec
 
 	// Config is the runsc-specific configuration for the sandbox.
-	Conf *Config
+	Conf *config.Config
 
 	// CID is the ID of the container to start.
 	CID string
@@ -256,13 +259,20 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
 	// All validation passed, logs the spec for debugging.
 	specutils.LogSpec(args.Spec)
 
-	err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files)
+	fds, err := fd.NewFromFiles(args.FilePayload.Files)
 	if err != nil {
+		return err
+	}
+	defer func() {
+		for _, fd := range fds {
+			_ = fd.Close()
+		}
+	}()
+	if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, fds); err != nil {
 		log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err)
 		return err
 	}
 	log.Debugf("Container %q started", args.CID)
-
 	return nil
 }
 
@@ -358,12 +368,20 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	cm.l.k = k
 
 	// Set up the restore environment.
+	ctx := k.SupervisorContext()
 	mntr := newContainerMounter(cm.l.root.spec, cm.l.root.goferFDs, cm.l.k, cm.l.mountHints)
-	renv, err := mntr.createRestoreEnvironment(cm.l.root.conf)
-	if err != nil {
-		return fmt.Errorf("creating RestoreEnvironment: %v", err)
+	if kernel.VFS2Enabled {
+		ctx, err = mntr.configureRestore(ctx, cm.l.root.conf)
+		if err != nil {
+			return fmt.Errorf("configuring filesystem restore: %v", err)
+		}
+	} else {
+		renv, err := mntr.createRestoreEnvironment(cm.l.root.conf)
+		if err != nil {
+			return fmt.Errorf("creating RestoreEnvironment: %v", err)
+		}
+		fs.SetRestoreEnvironment(*renv)
 	}
-	fs.SetRestoreEnvironment(*renv)
 
 	// Prepare to load from the state file.
 	if eps, ok := networkStack.(*netstack.Stack); ok {
@@ -390,7 +408,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 
 	// Load the state.
 	loadOpts := state.LoadOpts{Source: specFile}
-	if err := loadOpts.Load(k, networkStack, time.NewCalibratedClocks()); err != nil {
+	if err := loadOpts.Load(ctx, k, networkStack, time.NewCalibratedClocks(), &vfs.CompleteRestoreOptions{}); err != nil {
 		return err
 	}
 
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 149eb0b1b..a7c4ebb0c 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -27,41 +27,30 @@ import (
 // allowedSyscalls is the set of syscalls executed by the Sentry to the host OS.
 var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_CLOCK_GETTIME: {},
-	syscall.SYS_CLONE: []seccomp.Rule{
-		{
-			seccomp.AllowValue(
-				syscall.CLONE_VM |
-					syscall.CLONE_FS |
-					syscall.CLONE_FILES |
-					syscall.CLONE_SIGHAND |
-					syscall.CLONE_SYSVSEM |
-					syscall.CLONE_THREAD),
-		},
-	},
-	syscall.SYS_CLOSE: {},
-	syscall.SYS_DUP:   {},
+	syscall.SYS_CLOSE:         {},
+	syscall.SYS_DUP:           {},
 	syscall.SYS_DUP3: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.O_CLOEXEC),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.O_CLOEXEC),
 		},
 	},
 	syscall.SYS_EPOLL_CREATE1: {},
 	syscall.SYS_EPOLL_CTL:     {},
 	syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_EVENTFD2: []seccomp.Rule{
 		{
-			seccomp.AllowValue(0),
-			seccomp.AllowValue(0),
+			seccomp.EqualTo(0),
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_EXIT:       {},
@@ -70,16 +59,16 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_FCHMOD:     {},
 	syscall.SYS_FCNTL: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.F_GETFL),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.F_GETFL),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.F_SETFL),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.F_SETFL),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.F_GETFD),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.F_GETFD),
 		},
 	},
 	syscall.SYS_FSTAT:     {},
@@ -87,52 +76,52 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_FTRUNCATE: {},
 	syscall.SYS_FUTEX: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
+			seccomp.MatchAny{},
 		},
 		// Non-private variants are included for flipcall support. They are otherwise
 		// unncessary, as the sentry will use only private futexes internally.
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAIT),
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAIT),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(linux.FUTEX_WAKE),
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(linux.FUTEX_WAKE),
+			seccomp.MatchAny{},
 		},
 	},
 	syscall.SYS_GETPID: {},
 	unix.SYS_GETRANDOM: {},
 	syscall.SYS_GETSOCKOPT: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.SOL_SOCKET),
-			seccomp.AllowValue(syscall.SO_DOMAIN),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.SOL_SOCKET),
+			seccomp.EqualTo(syscall.SO_DOMAIN),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.SOL_SOCKET),
-			seccomp.AllowValue(syscall.SO_TYPE),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.SOL_SOCKET),
+			seccomp.EqualTo(syscall.SO_TYPE),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.SOL_SOCKET),
-			seccomp.AllowValue(syscall.SO_ERROR),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.SOL_SOCKET),
+			seccomp.EqualTo(syscall.SO_ERROR),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.SOL_SOCKET),
-			seccomp.AllowValue(syscall.SO_SNDBUF),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.SOL_SOCKET),
+			seccomp.EqualTo(syscall.SO_SNDBUF),
 		},
 	},
 	syscall.SYS_GETTID:       {},
@@ -141,38 +130,44 @@ var allowedSyscalls = seccomp.SyscallRules{
 	// setting/getting termios and winsize.
 	syscall.SYS_IOCTL: []seccomp.Rule{
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TCGETS),
-			seccomp.AllowAny{}, /* termios struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TCGETS),
+			seccomp.MatchAny{}, /* termios struct */
 		},
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TCSETS),
-			seccomp.AllowAny{}, /* termios struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TCSETS),
+			seccomp.MatchAny{}, /* termios struct */
 		},
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TCSETSF),
-			seccomp.AllowAny{}, /* termios struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TCSETSF),
+			seccomp.MatchAny{}, /* termios struct */
 		},
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TCSETSW),
-			seccomp.AllowAny{}, /* termios struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TCSETSW),
+			seccomp.MatchAny{}, /* termios struct */
 		},
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TIOCSWINSZ),
-			seccomp.AllowAny{}, /* winsize struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TIOCSWINSZ),
+			seccomp.MatchAny{}, /* winsize struct */
 		},
 		{
-			seccomp.AllowAny{}, /* fd */
-			seccomp.AllowValue(linux.TIOCGWINSZ),
-			seccomp.AllowAny{}, /* winsize struct */
+			seccomp.MatchAny{}, /* fd */
+			seccomp.EqualTo(linux.TIOCGWINSZ),
+			seccomp.MatchAny{}, /* winsize struct */
 		},
 	},
 	syscall.SYS_LSEEK:   {},
 	syscall.SYS_MADVISE: {},
+	unix.SYS_MEMBARRIER: []seccomp.Rule{
+		{
+			seccomp.EqualTo(linux.MEMBARRIER_CMD_GLOBAL),
+			seccomp.EqualTo(0),
+		},
+	},
 	syscall.SYS_MINCORE: {},
 	// Used by the Go runtime as a temporarily workaround for a Linux
 	// 5.2-5.4 bug.
@@ -182,46 +177,46 @@ var allowedSyscalls = seccomp.SyscallRules{
 	// TODO(b/148688965): Remove once this is gone from Go.
 	syscall.SYS_MLOCK: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(4096),
+			seccomp.MatchAny{},
+			seccomp.EqualTo(4096),
 		},
 	},
 	syscall.SYS_MMAP: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_SHARED),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_SHARED),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_PRIVATE),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_PRIVATE),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.PROT_WRITE | syscall.PROT_READ),
-			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.PROT_WRITE | syscall.PROT_READ),
+			seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
 		},
 	},
 	syscall.SYS_MPROTECT:  {},
@@ -237,32 +232,32 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_READ:      {},
 	syscall.SYS_RECVMSG: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
 		},
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
 		},
 	},
 	syscall.SYS_RECVMMSG: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(fdbased.MaxMsgsPerRecv),
-			seccomp.AllowValue(syscall.MSG_DONTWAIT),
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(fdbased.MaxMsgsPerRecv),
+			seccomp.EqualTo(syscall.MSG_DONTWAIT),
+			seccomp.EqualTo(0),
 		},
 	},
 	unix.SYS_SENDMMSG: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT),
-			seccomp.AllowValue(0),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT),
+			seccomp.EqualTo(0),
 		},
 	},
 	syscall.SYS_RESTART_SYSCALL: {},
@@ -272,49 +267,49 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_SCHED_YIELD:     {},
 	syscall.SYS_SENDMSG: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
 		},
 	},
 	syscall.SYS_SETITIMER: {},
 	syscall.SYS_SHUTDOWN: []seccomp.Rule{
 		// Used by fs/host to shutdown host sockets.
-		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)},
-		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)},
+		{seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RD)},
+		{seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_WR)},
 		// Used by unet to shutdown connections.
-		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
+		{seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RDWR)},
 	},
 	syscall.SYS_SIGALTSTACK:     {},
 	unix.SYS_STATX:              {},
 	syscall.SYS_SYNC_FILE_RANGE: {},
 	syscall.SYS_TEE: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
-			seccomp.AllowValue(1),                      /* len */
-			seccomp.AllowValue(unix.SPLICE_F_NONBLOCK), /* flags */
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
+			seccomp.EqualTo(1),                      /* len */
+			seccomp.EqualTo(unix.SPLICE_F_NONBLOCK), /* flags */
 		},
 	},
 	syscall.SYS_TGKILL: []seccomp.Rule{
 		{
-			seccomp.AllowValue(uint64(os.Getpid())),
+			seccomp.EqualTo(uint64(os.Getpid())),
 		},
 	},
 	syscall.SYS_UTIMENSAT: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0), /* null pathname */
-			seccomp.AllowAny{},
-			seccomp.AllowValue(0), /* flags */
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0), /* null pathname */
+			seccomp.MatchAny{},
+			seccomp.EqualTo(0), /* flags */
 		},
 	},
 	syscall.SYS_WRITE: {},
 	// For rawfile.NonBlockingWriteIovec.
 	syscall.SYS_WRITEV: []seccomp.Rule{
 		{
-			seccomp.AllowAny{},
-			seccomp.AllowAny{},
+			seccomp.MatchAny{},
+			seccomp.MatchAny{},
 			seccomp.GreaterThan(0),
 		},
 	},
@@ -325,10 +320,10 @@ func hostInetFilters() seccomp.SyscallRules {
 	return seccomp.SyscallRules{
 		syscall.SYS_ACCEPT4: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowAny{},
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.MatchAny{},
+				seccomp.MatchAny{},
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
 			},
 		},
 		syscall.SYS_BIND:        {},
@@ -337,84 +332,84 @@ func hostInetFilters() seccomp.SyscallRules {
 		syscall.SYS_GETSOCKNAME: {},
 		syscall.SYS_GETSOCKOPT: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IP),
-				seccomp.AllowValue(syscall.IP_TOS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IP),
+				seccomp.EqualTo(syscall.IP_TOS),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IP),
-				seccomp.AllowValue(syscall.IP_RECVTOS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IP),
+				seccomp.EqualTo(syscall.IP_RECVTOS),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_TCLASS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_TCLASS),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_RECVTCLASS),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_V6ONLY),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_V6ONLY),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_ERROR),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_ERROR),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_KEEPALIVE),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_KEEPALIVE),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_SNDBUF),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_SNDBUF),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_RCVBUF),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_RCVBUF),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_REUSEADDR),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_REUSEADDR),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_TYPE),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_TYPE),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_LINGER),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_LINGER),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_TCP),
-				seccomp.AllowValue(syscall.TCP_NODELAY),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_TCP),
+				seccomp.EqualTo(syscall.TCP_NODELAY),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_TCP),
-				seccomp.AllowValue(syscall.TCP_INFO),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_TCP),
+				seccomp.EqualTo(syscall.TCP_INFO),
 			},
 		},
 		syscall.SYS_IOCTL: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.TIOCOUTQ),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.TIOCOUTQ),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.TIOCINQ),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.TIOCINQ),
 			},
 		},
 		syscall.SYS_LISTEN:   {},
@@ -425,103 +420,103 @@ func hostInetFilters() seccomp.SyscallRules {
 		syscall.SYS_SENDTO:   {},
 		syscall.SYS_SETSOCKOPT: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_V6ONLY),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_V6ONLY),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_SNDBUF),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_SNDBUF),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_RCVBUF),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_RCVBUF),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_REUSEADDR),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_REUSEADDR),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_TCP),
-				seccomp.AllowValue(syscall.TCP_NODELAY),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_TCP),
+				seccomp.EqualTo(syscall.TCP_NODELAY),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IP),
-				seccomp.AllowValue(syscall.IP_TOS),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IP),
+				seccomp.EqualTo(syscall.IP_TOS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IP),
-				seccomp.AllowValue(syscall.IP_RECVTOS),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IP),
+				seccomp.EqualTo(syscall.IP_RECVTOS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_TCLASS),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_TCLASS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_IPV6),
-				seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
-				seccomp.AllowAny{},
-				seccomp.AllowValue(4),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_IPV6),
+				seccomp.EqualTo(syscall.IPV6_RECVTCLASS),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(4),
 			},
 		},
 		syscall.SYS_SHUTDOWN: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SHUT_RD),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SHUT_RD),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SHUT_WR),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SHUT_WR),
 			},
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SHUT_RDWR),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SHUT_RDWR),
 			},
 		},
 		syscall.SYS_SOCKET: []seccomp.Rule{
 			{
-				seccomp.AllowValue(syscall.AF_INET),
-				seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
-				seccomp.AllowValue(0),
+				seccomp.EqualTo(syscall.AF_INET),
+				seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.EqualTo(0),
 			},
 			{
-				seccomp.AllowValue(syscall.AF_INET),
-				seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
-				seccomp.AllowValue(0),
+				seccomp.EqualTo(syscall.AF_INET),
+				seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.EqualTo(0),
 			},
 			{
-				seccomp.AllowValue(syscall.AF_INET6),
-				seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
-				seccomp.AllowValue(0),
+				seccomp.EqualTo(syscall.AF_INET6),
+				seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.EqualTo(0),
 			},
 			{
-				seccomp.AllowValue(syscall.AF_INET6),
-				seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
-				seccomp.AllowValue(0),
+				seccomp.EqualTo(syscall.AF_INET6),
+				seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.EqualTo(0),
 			},
 		},
 		syscall.SYS_WRITEV: {},
@@ -532,20 +527,20 @@ func controlServerFilters(fd int) seccomp.SyscallRules {
 	return seccomp.SyscallRules{
 		syscall.SYS_ACCEPT: []seccomp.Rule{
 			{
-				seccomp.AllowValue(fd),
+				seccomp.EqualTo(fd),
 			},
 		},
 		syscall.SYS_LISTEN: []seccomp.Rule{
 			{
-				seccomp.AllowValue(fd),
-				seccomp.AllowValue(16 /* unet.backlog */),
+				seccomp.EqualTo(fd),
+				seccomp.EqualTo(16 /* unet.backlog */),
 			},
 		},
 		syscall.SYS_GETSOCKOPT: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.SOL_SOCKET),
-				seccomp.AllowValue(syscall.SO_PEERCRED),
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.SOL_SOCKET),
+				seccomp.EqualTo(syscall.SO_PEERCRED),
 			},
 		},
 	}
diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go
index 5335ff82c..cea5613b8 100644
--- a/runsc/boot/filter/config_amd64.go
+++ b/runsc/boot/filter/config_amd64.go
@@ -24,8 +24,41 @@ import (
 )
 
 func init() {
-	allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL],
-		seccomp.Rule{seccomp.AllowValue(linux.ARCH_GET_FS)},
-		seccomp.Rule{seccomp.AllowValue(linux.ARCH_SET_FS)},
-	)
+	allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{
+		// TODO(b/168828518): No longer used in Go 1.16+.
+		{seccomp.EqualTo(linux.ARCH_SET_FS)},
+	}
+
+	allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{
+		// parent_tidptr and child_tidptr are always 0 because neither
+		// CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used.
+		{
+			seccomp.EqualTo(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SETTLS |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+			seccomp.MatchAny{}, // newsp
+			seccomp.EqualTo(0), // parent_tidptr
+			seccomp.EqualTo(0), // child_tidptr
+			seccomp.MatchAny{}, // tls
+		},
+		{
+			// TODO(b/168828518): No longer used in Go 1.16+ (on amd64).
+			seccomp.EqualTo(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+			seccomp.MatchAny{}, // newsp
+			seccomp.EqualTo(0), // parent_tidptr
+			seccomp.EqualTo(0), // child_tidptr
+			seccomp.MatchAny{}, // tls
+		},
+	}
 }
diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go
index 7fa9bbda3..37313f97f 100644
--- a/runsc/boot/filter/config_arm64.go
+++ b/runsc/boot/filter/config_arm64.go
@@ -16,6 +16,29 @@
 
 package filter
 
-// Reserve for future customization.
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
 func init() {
+	allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{
+		{
+			seccomp.EqualTo(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+			seccomp.MatchAny{}, // newsp
+			// These arguments are left uninitialized by the Go
+			// runtime, so they may be anything (and are unused by
+			// the host).
+			seccomp.MatchAny{}, // parent_tidptr
+			seccomp.MatchAny{}, // tls
+			seccomp.MatchAny{}, // child_tidptr
+		},
+	}
 }
diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go
index 194952a7b..7b8669595 100644
--- a/runsc/boot/filter/config_profile.go
+++ b/runsc/boot/filter/config_profile.go
@@ -25,9 +25,9 @@ func profileFilters() seccomp.SyscallRules {
 	return seccomp.SyscallRules{
 		syscall.SYS_OPENAT: []seccomp.Rule{
 			{
-				seccomp.AllowAny{},
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
+				seccomp.MatchAny{},
+				seccomp.MatchAny{},
+				seccomp.EqualTo(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
 			},
 		},
 	}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index a30fa198e..6b6ae98d7 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -34,6 +34,7 @@ import (
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
@@ -48,6 +49,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -66,7 +68,7 @@ const (
 // tmpfs has some extra supported options that we must pass through.
 var tmpfsAllowedData = []string{"mode", "uid", "gid"}
 
-func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
+func addOverlay(ctx context.Context, conf *config.Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
 	// Upper layer uses the same flags as lower, but it must be read-write.
 	upperFlags := lowerFlags
 	upperFlags.ReadOnly = false
@@ -163,7 +165,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
 }
 
 // p9MountData creates a slice of p9 mount data.
-func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
+func p9MountData(fd int, fa config.FileAccessType, vfs2 bool) []string {
 	opts := []string{
 		"trans=fd",
 		"rfdno=" + strconv.Itoa(fd),
@@ -174,7 +176,7 @@ func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
 		// enablement.
 		opts = append(opts, "privateunixsocket=true")
 	}
-	if fa == FileAccessShared {
+	if fa == config.FileAccessShared {
 		opts = append(opts, "cache=remote_revalidating")
 	}
 	return opts
@@ -259,7 +261,7 @@ func mustFindFilesystem(name string) fs.Filesystem {
 
 // addSubmountOverlay overlays the inode over a ramfs tree containing the given
 // paths.
-func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
+func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string, mf fs.MountSourceFlags) (*fs.Inode, error) {
 	// Construct a ramfs tree of mount points. The contents never
 	// change, so this can be fully caching. There's no real
 	// filesystem backing this tree, so we set the filesystem to
@@ -269,7 +271,7 @@ func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string
 	if err != nil {
 		return nil, fmt.Errorf("creating mount tree: %v", err)
 	}
-	overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{})
+	overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, mf)
 	if err != nil {
 		return nil, fmt.Errorf("adding mount overlay: %v", err)
 	}
@@ -288,7 +290,7 @@ func subtargets(root string, mnts []specs.Mount) []string {
 	return targets
 }
 
-func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+func setupContainerFS(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
 	if conf.VFS2 {
 		return setupContainerVFS2(ctx, conf, mntr, procArgs)
 	}
@@ -326,14 +328,14 @@ func adjustDirentCache(k *kernel.Kernel) error {
 }
 
 type fdDispenser struct {
-	fds []int
+	fds []*fd.FD
 }
 
 func (f *fdDispenser) remove() int {
 	if f.empty() {
 		panic("fdDispenser out of fds")
 	}
-	rv := f.fds[0]
+	rv := f.fds[0].Release()
 	f.fds = f.fds[1:]
 	return rv
 }
@@ -459,27 +461,27 @@ func (m *mountHint) isSupported() bool {
 func (m *mountHint) checkCompatible(mount specs.Mount) error {
 	// Remove options that don't affect to mount's behavior.
 	masterOpts := filterUnsupportedOptions(m.mount)
-	slaveOpts := filterUnsupportedOptions(mount)
+	replicaOpts := filterUnsupportedOptions(mount)
 
-	if len(masterOpts) != len(slaveOpts) {
-		return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+	if len(masterOpts) != len(replicaOpts) {
+		return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts)
 	}
 
 	sort.Strings(masterOpts)
-	sort.Strings(slaveOpts)
+	sort.Strings(replicaOpts)
 	for i, opt := range masterOpts {
-		if opt != slaveOpts[i] {
-			return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+		if opt != replicaOpts[i] {
+			return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts)
 		}
 	}
 	return nil
 }
 
-func (m *mountHint) fileAccessType() FileAccessType {
+func (m *mountHint) fileAccessType() config.FileAccessType {
 	if m.share == container {
-		return FileAccessExclusive
+		return config.FileAccessExclusive
 	}
-	return FileAccessShared
+	return config.FileAccessShared
 }
 
 func filterUnsupportedOptions(mount specs.Mount) []string {
@@ -570,7 +572,7 @@ type containerMounter struct {
 	hints *podMountHints
 }
 
-func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
+func newContainerMounter(spec *specs.Spec, goferFDs []*fd.FD, k *kernel.Kernel, hints *podMountHints) *containerMounter {
 	return &containerMounter{
 		root:   spec.Root,
 		mounts: compileMounts(spec),
@@ -583,7 +585,7 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
 // processHints processes annotations that container hints about how volumes
 // should be mounted (e.g. a volume shared between containers). It must be
 // called for the root container only.
-func (c *containerMounter) processHints(conf *Config, creds *auth.Credentials) error {
+func (c *containerMounter) processHints(conf *config.Config, creds *auth.Credentials) error {
 	if conf.VFS2 {
 		return c.processHintsVFS2(conf, creds)
 	}
@@ -607,7 +609,7 @@ func (c *containerMounter) processHints(conf *Config, creds *auth.Credentials) e
 // setupFS is used to set up the file system for all containers. This is the
 // main entry point method, with most of the other being internal only. It
 // returns the mount namespace that is created for the container.
-func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
+func (c *containerMounter) setupFS(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
 	log.Infof("Configuring container's file system")
 
 	// Create context with root credentials to mount the filesystem (the current
@@ -633,7 +635,7 @@ func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessA
 	return mns, nil
 }
 
-func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Config) (*fs.MountNamespace, error) {
+func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config) (*fs.MountNamespace, error) {
 	rootInode, err := c.createRootMount(ctx, conf)
 	if err != nil {
 		return nil, fmt.Errorf("creating filesystem for container: %v", err)
@@ -645,7 +647,7 @@ func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Confi
 	return mns, nil
 }
 
-func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error {
+func (c *containerMounter) mountSubmounts(ctx context.Context, conf *config.Config, mns *fs.MountNamespace) error {
 	root := mns.Root()
 	defer root.DecRef(ctx)
 
@@ -681,7 +683,7 @@ func (c *containerMounter) checkDispenser() error {
 
 // mountSharedMaster mounts the master of a volume that is shared among
 // containers in a pod. It returns the root mount's inode.
-func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) {
+func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *config.Config, hint *mountHint) (*fs.Inode, error) {
 	// Map mount type to filesystem name, and parse out the options that we are
 	// capable of dealing with.
 	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount)
@@ -721,7 +723,7 @@ func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config,
 }
 
 // createRootMount creates the root filesystem.
-func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) {
+func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Config) (*fs.Inode, error) {
 	// First construct the filesystem from the spec.Root.
 	mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay}
 
@@ -746,7 +748,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
 	// for submount paths.  "/dev" "/sys" "/proc" and "/tmp" are always
 	// mounted even if they are not in the spec.
 	submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp")
-	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
+	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts, mf)
 	if err != nil {
 		return nil, fmt.Errorf("adding submount overlay: %v", err)
 	}
@@ -766,7 +768,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
 
 // getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
 // used for mounts.
-func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) {
+func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m specs.Mount) (string, []string, bool, error) {
 	var (
 		fsName     string
 		opts       []string
@@ -800,19 +802,19 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 	return fsName, opts, useOverlay, nil
 }
 
-func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType {
+func (c *containerMounter) getMountAccessType(mount specs.Mount) config.FileAccessType {
 	if hint := c.hints.findMount(mount); hint != nil {
 		return hint.fileAccessType()
 	}
 	// Non-root bind mounts are always shared if no hints were provided.
-	return FileAccessShared
+	return config.FileAccessShared
 }
 
 // mountSubmount mounts volumes inside the container's root. Because mounts may
 // be readonly, a lower ramfs overlay is added to create the mount point dir.
 // Another overlay is added with tmpfs on top if Config.Overlay is true.
 // 'm.Destination' must be an absolute path with '..' and symlinks resolved.
-func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
+func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
 	// Map mount type to filesystem name, and parse out the options that we are
 	// capable of dealing with.
 	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
@@ -856,7 +858,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
 	submounts := subtargets(m.Destination, c.mounts)
 	if len(submounts) > 0 {
 		log.Infof("Adding submount overlay over %q", m.Destination)
-		inode, err = addSubmountOverlay(ctx, inode, submounts)
+		inode, err = addSubmountOverlay(ctx, inode, submounts, mf)
 		if err != nil {
 			return fmt.Errorf("adding submount overlay: %v", err)
 		}
@@ -911,7 +913,7 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun
 
 // addRestoreMount adds a mount to the MountSources map used for restoring a
 // checkpointed container.
-func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
+func (c *containerMounter) addRestoreMount(conf *config.Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
 	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
 	if err != nil {
 		return err
@@ -936,7 +938,7 @@ func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnviron
 
 // createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding
 // the mounts to the environment.
-func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) {
+func (c *containerMounter) createRestoreEnvironment(conf *config.Config) (*fs.RestoreEnvironment, error) {
 	renv := &fs.RestoreEnvironment{
 		MountSources: make(map[string][]fs.MountArgs),
 	}
@@ -991,7 +993,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn
 //
 // Note that when there are submounts inside of '/tmp', directories for the
 // mount points must be present, making '/tmp' not empty anymore.
-func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
+func (c *containerMounter) mountTmp(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent) error {
 	for _, m := range c.mounts {
 		if filepath.Clean(m.Destination) == "/tmp" {
 			log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m)
diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go
index 912037075..e986231e5 100644
--- a/runsc/boot/fs_test.go
+++ b/runsc/boot/fs_test.go
@@ -20,6 +20,7 @@ import (
 	"testing"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 func TestPodMountHintsHappy(t *testing.T) {
@@ -196,7 +197,7 @@ func TestGetMountAccessType(t *testing.T) {
 	for _, tst := range []struct {
 		name        string
 		annotations map[string]string
-		want        FileAccessType
+		want        config.FileAccessType
 	}{
 		{
 			name: "container=exclusive",
@@ -205,7 +206,7 @@ func TestGetMountAccessType(t *testing.T) {
 				MountPrefix + "mount1.type":   "bind",
 				MountPrefix + "mount1.share":  "container",
 			},
-			want: FileAccessExclusive,
+			want: config.FileAccessExclusive,
 		},
 		{
 			name: "pod=shared",
@@ -214,7 +215,7 @@ func TestGetMountAccessType(t *testing.T) {
 				MountPrefix + "mount1.type":   "bind",
 				MountPrefix + "mount1.share":  "pod",
 			},
-			want: FileAccessShared,
+			want: config.FileAccessShared,
 		},
 		{
 			name: "shared=shared",
@@ -223,7 +224,7 @@ func TestGetMountAccessType(t *testing.T) {
 				MountPrefix + "mount1.type":   "bind",
 				MountPrefix + "mount1.share":  "shared",
 			},
-			want: FileAccessShared,
+			want: config.FileAccessShared,
 		},
 		{
 			name: "default=shared",
@@ -232,7 +233,7 @@ func TestGetMountAccessType(t *testing.T) {
 				MountPrefix + "mount1.type":   "bind",
 				MountPrefix + "mount1.share":  "container",
 			},
-			want: FileAccessShared,
+			want: config.FileAccessShared,
 		},
 	} {
 		t.Run(tst.name, func(t *testing.T) {
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 40c6f99fd..8c6ab213d 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -27,12 +27,15 @@ import (
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/bpf"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/memutil"
 	"gvisor.dev/gvisor/pkg/rand"
 	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/refsvfs2"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/fdimport"
@@ -67,7 +70,9 @@ import (
 	"gvisor.dev/gvisor/runsc/boot/filter"
 	_ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms.
 	"gvisor.dev/gvisor/runsc/boot/pprof"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
+	"gvisor.dev/gvisor/runsc/specutils/seccomp"
 
 	// Include supported socket providers.
 	"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
@@ -79,7 +84,7 @@ import (
 )
 
 type containerInfo struct {
-	conf *Config
+	conf *config.Config
 
 	// spec is the base configuration for the root container.
 	spec *specs.Spec
@@ -88,10 +93,10 @@ type containerInfo struct {
 	procArgs kernel.CreateProcessArgs
 
 	// stdioFDs contains stdin, stdout, and stderr.
-	stdioFDs []int
+	stdioFDs []*fd.FD
 
 	// goferFDs are the FDs that attach the sandbox to the gofers.
-	goferFDs []int
+	goferFDs []*fd.FD
 }
 
 // Loader keeps state needed to start the kernel and run the container..
@@ -165,7 +170,7 @@ type Args struct {
 	// Spec is the sandbox specification.
 	Spec *specs.Spec
 	// Conf is the system configuration.
-	Conf *Config
+	Conf *config.Config
 	// ControllerFD is the FD to the URPC controller. The Loader takes ownership
 	// of this FD and may close it at any time.
 	ControllerFD int
@@ -278,6 +283,7 @@ func New(args Args) (*Loader, error) {
 		args.NumCPU = runtime.NumCPU()
 	}
 	log.Infof("CPUs: %d", args.NumCPU)
+	runtime.GOMAXPROCS(args.NumCPU)
 
 	if args.TotalMem > 0 {
 		// Adjust the total memory returned by the Sentry so that applications that
@@ -355,12 +361,17 @@ func New(args Args) (*Loader, error) {
 		k.SetHostMount(hostMount)
 	}
 
+	info := containerInfo{
+		conf:     args.Conf,
+		spec:     args.Spec,
+		procArgs: procArgs,
+	}
+
 	// Make host FDs stable between invocations. Host FDs must map to the exact
 	// same number when the sandbox is restored. Otherwise the wrong FD will be
 	// used.
-	var stdioFDs []int
 	newfd := startingStdioFD
-	for _, fd := range args.StdioFDs {
+	for _, stdioFD := range args.StdioFDs {
 		// Check that newfd is unused to avoid clobbering over it.
 		if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
 			if err != nil {
@@ -369,14 +380,17 @@ func New(args Args) (*Loader, error) {
 			return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
 		}
 
-		err := unix.Dup3(fd, newfd, unix.O_CLOEXEC)
+		err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
 		if err != nil {
-			return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err)
+			return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
 		}
-		stdioFDs = append(stdioFDs, newfd)
-		_ = unix.Close(fd)
+		info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
+		_ = unix.Close(stdioFD)
 		newfd++
 	}
+	for _, goferFD := range args.GoferFDs {
+		info.goferFDs = append(info.goferFDs, fd.New(goferFD))
+	}
 
 	eid := execID{cid: args.ID}
 	l := &Loader{
@@ -385,13 +399,7 @@ func New(args Args) (*Loader, error) {
 		sandboxID:  args.ID,
 		processes:  map[execID]*execProcess{eid: {}},
 		mountHints: mountHints,
-		root: containerInfo{
-			conf:     args.Conf,
-			stdioFDs: stdioFDs,
-			goferFDs: args.GoferFDs,
-			spec:     args.Spec,
-			procArgs: procArgs,
-		},
+		root:       info,
 	}
 
 	// We don't care about child signals; some platforms can generate a
@@ -465,13 +473,28 @@ func (l *Loader) Destroy() {
 	}
 	l.watchdog.Stop()
 
-	for i, fd := range l.root.stdioFDs {
-		_ = unix.Close(fd)
-		l.root.stdioFDs[i] = -1
+	// Release all kernel resources. This is only safe after we can no longer
+	// save/restore.
+	l.k.Release()
+
+	// All sentry-created resources should have been released at this point;
+	// check for reference leaks.
+	if refsvfs2.LeakCheckEnabled() {
+		refsvfs2.DoLeakCheck()
+	}
+
+	// In the success case, stdioFDs and goferFDs will only contain
+	// released/closed FDs that ownership has been passed over to host FDs and
+	// gofer sessions. Close them here in case of failure.
+	for _, fd := range l.root.stdioFDs {
+		_ = fd.Close()
+	}
+	for _, fd := range l.root.goferFDs {
+		_ = fd.Close()
 	}
 }
 
-func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) {
+func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) {
 	p, err := platform.Lookup(conf.Platform)
 	if err != nil {
 		panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err))
@@ -498,13 +521,14 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {
 	return mf, nil
 }
 
+// installSeccompFilters installs sandbox seccomp filters with the host.
 func (l *Loader) installSeccompFilters() error {
 	if l.root.conf.DisableSeccomp {
 		filter.Report("syscall filter is DISABLED. Running in less secure mode.")
 	} else {
 		opts := filter.Options{
 			Platform:      l.k.Platform,
-			HostNetwork:   l.root.conf.Network == NetworkHost,
+			HostNetwork:   l.root.conf.Network == config.NetworkHost,
 			ProfileEnable: l.root.conf.ProfileEnable,
 			ControllerFD:  l.ctrl.srv.FD(),
 		}
@@ -531,7 +555,7 @@ func (l *Loader) Run() error {
 }
 
 func (l *Loader) run() error {
-	if l.root.conf.Network == NetworkHost {
+	if l.root.conf.Network == config.NetworkHost {
 		// Delay host network configuration to this point because network namespace
 		// is configured after the loader is created and before Run() is called.
 		log.Debugf("Configuring host network")
@@ -568,6 +592,7 @@ func (l *Loader) run() error {
 		if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil {
 			return err
 		}
+
 	}
 
 	ep.tg = l.k.GlobalInit()
@@ -597,17 +622,6 @@ func (l *Loader) run() error {
 		}
 	})
 
-	// l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again
-	// either in createFDTable() during initial start or in descriptor.initAfterLoad()
-	// during restore, we can release l.stdioFDs now. VFS2 takes ownership of the
-	// passed FDs, so only close for VFS1.
-	if !kernel.VFS2Enabled {
-		for i, fd := range l.root.stdioFDs {
-			_ = unix.Close(fd)
-			l.root.stdioFDs[i] = -1
-		}
-	}
-
 	log.Infof("Process should have started...")
 	l.watchdog.Start()
 	return l.k.Start()
@@ -627,9 +641,9 @@ func (l *Loader) createContainer(cid string) error {
 }
 
 // startContainer starts a child container. It returns the thread group ID of
-// the newly created process. Caller owns 'files' and may close them after
-// this method returns.
-func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
+// the newly created process. Used FDs are either closed or released. It's safe
+// for the caller to close any remaining files upon return.
+func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*fd.FD) error {
 	// Create capabilities.
 	caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
 	if err != nil {
@@ -680,28 +694,15 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
 	}
 
 	info := &containerInfo{
-		conf: conf,
-		spec: spec,
+		conf:     conf,
+		spec:     spec,
+		stdioFDs: files[:3],
+		goferFDs: files[3:],
 	}
 	info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns)
 	if err != nil {
 		return fmt.Errorf("creating new process: %v", err)
 	}
-
-	// setupContainerFS() dups stdioFDs, so we don't need to dup them here.
-	for _, f := range files[:3] {
-		info.stdioFDs = append(info.stdioFDs, int(f.Fd()))
-	}
-
-	// Can't take ownership away from os.File. dup them to get a new FDs.
-	for _, f := range files[3:] {
-		fd, err := unix.Dup(int(f.Fd()))
-		if err != nil {
-			return fmt.Errorf("failed to dup file: %v", err)
-		}
-		info.goferFDs = append(info.goferFDs, fd)
-	}
-
 	tg, err := l.createContainerProcess(false, cid, info, ep)
 	if err != nil {
 		return err
@@ -743,7 +744,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
 		return nil, err
 	}
 
-	// Add the HOME enviroment variable if it is not already set.
+	// Add the HOME environment variable if it is not already set.
 	var envv []string
 	if kernel.VFS2Enabled {
 		envv, err = user.MaybeAddExecUserHomeVFS2(ctx, info.procArgs.MountNamespaceVFS2,
@@ -779,19 +780,44 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
 		}
 	}
 
+	// Install seccomp filters with the new task if there are any.
+	if info.conf.OCISeccomp {
+		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
+			program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp)
+			if err != nil {
+				return nil, fmt.Errorf("building seccomp program: %v", err)
+			}
+
+			if log.IsLogging(log.Debug) {
+				out, _ := bpf.DecodeProgram(program)
+				log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out)
+			}
+
+			task := tg.Leader()
+			// NOTE: It seems Flags are ignored by runc so we ignore them too.
+			if err := task.AppendSyscallFilter(program, true); err != nil {
+				return nil, fmt.Errorf("appending seccomp filters: %v", err)
+			}
+		}
+	} else {
+		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
+			log.Warningf("Seccomp spec is being ignored")
+		}
+	}
+
 	return tg, nil
 }
 
 // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
-// the gofer FDs looking for disconnects, and destroys the container if a
+// the gofer FDs looking for disconnects, and kills the container processes if a
 // disconnect occurs in any of the gofer FDs.
-func (l *Loader) startGoferMonitor(cid string, goferFDs []int) {
+func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) {
 	go func() {
 		log.Debugf("Monitoring gofer health for container %q", cid)
 		var events []unix.PollFd
-		for _, fd := range goferFDs {
+		for _, goferFD := range goferFDs {
 			events = append(events, unix.PollFd{
-				Fd:     int32(fd),
+				Fd:     int32(goferFD.FD()),
 				Events: unix.POLLHUP | unix.POLLRDHUP,
 			})
 		}
@@ -804,18 +830,15 @@ func (l *Loader) startGoferMonitor(cid string, goferFDs []int) {
 			panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err))
 		}
 
-		// Check if the gofer has stopped as part of normal container destruction.
-		// This is done just to avoid sending an annoying error message to the log.
-		// Note that there is a small race window in between mu.Unlock() and the
-		// lock being reacquired in destroyContainer(), but it's harmless to call
-		// destroyContainer() multiple times.
 		l.mu.Lock()
-		_, ok := l.processes[execID{cid: cid}]
-		l.mu.Unlock()
-		if ok {
-			log.Infof("Gofer socket disconnected, destroying container %q", cid)
-			if err := l.destroyContainer(cid); err != nil {
-				log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err)
+		defer l.mu.Unlock()
+
+		// The gofer could have been stopped due to a normal container shutdown.
+		// Check if the container has not stopped yet.
+		if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: cid}); tg != nil {
+			log.Infof("Gofer socket disconnected, killing container %q", cid)
+			if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
+				log.Warningf("Error killing container %q after gofer stopped: %v", cid, err)
 			}
 		}
 	}()
@@ -884,17 +907,24 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 		return 0, fmt.Errorf("container %q not started", args.ContainerID)
 	}
 
-	// Get the container MountNamespace from the Task.
+	// Get the container MountNamespace from the Task. Try to acquire ref may fail
+	// in case it raced with task exit.
 	if kernel.VFS2Enabled {
-		// task.MountNamespace() does not take a ref, so we must do so ourselves.
+		// task.MountNamespaceVFS2() does not take a ref, so we must do so ourselves.
 		args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2()
-		args.MountNamespaceVFS2.IncRef()
+		if !args.MountNamespaceVFS2.TryIncRef() {
+			return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
+		}
 	} else {
+		var reffed bool
 		tg.Leader().WithMuLocked(func(t *kernel.Task) {
 			// task.MountNamespace() does not take a ref, so we must do so ourselves.
 			args.MountNamespace = t.MountNamespace()
-			args.MountNamespace.IncRef()
+			reffed = args.MountNamespace.TryIncRef()
 		})
+		if !reffed {
+			return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
+		}
 	}
 
 	// Add the HOME environment variable if it is not already set.
@@ -902,7 +932,6 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 		root := args.MountNamespaceVFS2.Root()
 		ctx := vfs.WithRoot(l.k.SupervisorContext(), root)
 		defer args.MountNamespaceVFS2.DecRef(ctx)
-		defer root.DecRef(ctx)
 		envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv)
 		if err != nil {
 			return 0, err
@@ -1017,17 +1046,17 @@ func (l *Loader) WaitExit() kernel.ExitStatus {
 	return l.k.GlobalInit().ExitStatus()
 }
 
-func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
+func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
 	// Create an empty network stack because the network namespace may be empty at
 	// this point. Netns is configured before Run() is called. Netstack is
 	// configured using a control uRPC message. Host network is configured inside
 	// Run().
 	switch conf.Network {
-	case NetworkHost:
+	case config.NetworkHost:
 		// No network namespacing support for hostinet yet, hence creator is nil.
 		return inet.NewRootNamespace(hostinet.NewStack(), nil), nil
 
-	case NetworkNone, NetworkSandbox:
+	case config.NetworkNone, config.NetworkSandbox:
 		s, err := newEmptySandboxNetworkStack(clock, uniqueID)
 		if err != nil {
 			return nil, err
@@ -1045,8 +1074,8 @@ func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.Uni
 }
 
 func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
-	netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
-	transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
+	netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol}
+	transProtos := []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol, icmp.NewProtocol4}
 	s := netstack.Stack{stack.New(stack.Options{
 		NetworkProtocols:   netProtos,
 		TransportProtocols: transProtos,
@@ -1060,17 +1089,30 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in
 	})}
 
 	// Enable SACK Recovery.
-	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
-		return nil, fmt.Errorf("failed to enable SACK: %s", err)
+	{
+		opt := tcpip.TCPSACKEnabled(true)
+		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+		}
 	}
 
 	// Set default TTLs as required by socket/netstack.
-	s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
-	s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+	{
+		opt := tcpip.DefaultTTLOption(netstack.DefaultTTL)
+		if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err)
+		}
+		if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err)
+		}
+	}
 
 	// Enable Receive Buffer Auto-Tuning.
-	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
-		return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err)
+	{
+		opt := tcpip.TCPModerateReceiveBufferOption(true)
+		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+		}
 	}
 
 	return &s, nil
@@ -1266,7 +1308,7 @@ func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2
 	return ep.tty, ep.ttyVFS2, nil
 }
 
-func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
 	if len(stdioFDs) != 3 {
 		return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
 	}
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index aa3fdf96c..b77b4762e 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -26,6 +26,7 @@ import (
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/control/server"
+	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
@@ -34,6 +35,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
+	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/fsgofer"
 )
 
@@ -43,15 +45,19 @@ func init() {
 	if err := fsgofer.OpenProcSelfFD(); err != nil {
 		panic(err)
 	}
+	config.RegisterFlags()
 }
 
-func testConfig() *Config {
-	return &Config{
-		RootDir:        "unused_root_dir",
-		Network:        NetworkNone,
-		DisableSeccomp: true,
-		Platform:       "ptrace",
+func testConfig() *config.Config {
+	conf, err := config.NewFromFlags()
+	if err != nil {
+		panic(err)
 	}
+	// Change test defaults.
+	conf.RootDir = "unused_root_dir"
+	conf.Network = config.NetworkNone
+	conf.DisableSeccomp = true
+	return conf
 }
 
 // testSpec returns a simple spec that can be used in tests.
@@ -258,9 +264,9 @@ type CreateMountTestcase struct {
 	expectedPaths []string
 }
 
-func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
+func createMountTestcases() []*CreateMountTestcase {
 	testCases := []*CreateMountTestcase{
-		&CreateMountTestcase{
+		{
 			// Only proc.
 			name: "only proc mount",
 			spec: specs.Spec{
@@ -298,11 +304,10 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
 					},
 				},
 			},
-			// /some/deep/path should be mounted, along with /proc,
-			// /dev, and /sys.
+			// /some/deep/path should be mounted, along with /proc, /dev, and /sys.
 			expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"},
 		},
-		&CreateMountTestcase{
+		{
 			// Mounts are nested inside each other.
 			name: "nested mounts",
 			spec: specs.Spec{
@@ -346,7 +351,7 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
 			expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/qux",
 				"/foo/qux-quz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"},
 		},
-		&CreateMountTestcase{
+		{
 			name: "mount inside /dev",
 			spec: specs.Spec{
 				Root: &specs.Root{
@@ -389,46 +394,42 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
 			},
 			expectedPaths: []string{"/proc", "/dev", "/dev/fd-foo", "/dev/foo", "/dev/bar", "/sys"},
 		},
-	}
-
-	vfsCase := &CreateMountTestcase{
-		name: "mounts inside mandatory mounts",
-		spec: specs.Spec{
-			Root: &specs.Root{
-				Path:     os.TempDir(),
-				Readonly: true,
-			},
-			Mounts: []specs.Mount{
-				{
-					Destination: "/proc",
-					Type:        "tmpfs",
+		{
+			name: "mounts inside mandatory mounts",
+			spec: specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
 				},
-				// TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports
-				//  MkDirAt in VFS2 (and remove the reduntant append).
-				// {
-				//		Destination: "/sys/bar",
-				//		Type:        "tmpfs",
-				//	},
-				//
-				{
-					Destination: "/tmp/baz",
-					Type:        "tmpfs",
+				Mounts: []specs.Mount{
+					{
+						Destination: "/proc",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/sys/bar",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/tmp/baz",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/dev/goo",
+						Type:        "tmpfs",
+					},
 				},
 			},
+			expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz", "/dev/goo"},
 		},
-		expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"},
 	}
 
-	if !vfs2 {
-		vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"})
-		vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar")
-	}
-	return append(testCases, vfsCase)
+	return testCases
 }
 
 // Test that MountNamespace can be created with various specs.
 func TestCreateMountNamespace(t *testing.T) {
-	for _, tc := range createMountTestcases(false /* vfs2 */) {
+	for _, tc := range createMountTestcases() {
 		t.Run(tc.name, func(t *testing.T) {
 			conf := testConfig()
 			ctx := contexttest.Context(t)
@@ -439,7 +440,7 @@ func TestCreateMountNamespace(t *testing.T) {
 			}
 			defer cleanup()
 
-			mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{})
+			mntr := newContainerMounter(&tc.spec, []*fd.FD{fd.New(sandEnd)}, nil, &podMountHints{})
 			mns, err := mntr.createMountNamespace(ctx, conf)
 			if err != nil {
 				t.Fatalf("failed to create mount namespace: %v", err)
@@ -465,7 +466,7 @@ func TestCreateMountNamespace(t *testing.T) {
 
 // Test that MountNamespace can be created with various specs.
 func TestCreateMountNamespaceVFS2(t *testing.T) {
-	for _, tc := range createMountTestcases(true /* vfs2 */) {
+	for _, tc := range createMountTestcases() {
 		t.Run(tc.name, func(t *testing.T) {
 			spec := testSpec()
 			spec.Mounts = tc.spec.Mounts
@@ -485,12 +486,13 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
 			}
 
 			ctx := l.k.SupervisorContext()
-			mns, err := mntr.setupVFS2(ctx, l.root.conf, &l.root.procArgs)
+			mns, err := mntr.mountAll(l.root.conf, &l.root.procArgs)
 			if err != nil {
-				t.Fatalf("failed to setupVFS2: %v", err)
+				t.Fatalf("mountAll: %v", err)
 			}
 
 			root := mns.Root()
+			root.IncRef()
 			defer root.DecRef(ctx)
 			for _, p := range tc.expectedPaths {
 				target := &vfs.PathOperation{
@@ -545,7 +547,7 @@ func TestRestoreEnvironment(t *testing.T) {
 						{
 							Dev:        "9pfs-/",
 							Flags:      fs.MountSourceFlags{ReadOnly: true},
-							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
 						},
 					},
 					"tmpfs": {
@@ -599,7 +601,7 @@ func TestRestoreEnvironment(t *testing.T) {
 						{
 							Dev:        "9pfs-/",
 							Flags:      fs.MountSourceFlags{ReadOnly: true},
-							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
 						},
 						{
 							Dev:        "9pfs-/dev/fd-foo",
@@ -657,7 +659,7 @@ func TestRestoreEnvironment(t *testing.T) {
 						{
 							Dev:        "9pfs-/",
 							Flags:      fs.MountSourceFlags{ReadOnly: true},
-							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
 						},
 					},
 					"tmpfs": {
@@ -697,7 +699,11 @@ func TestRestoreEnvironment(t *testing.T) {
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
 			conf := testConfig()
-			mntr := newContainerMounter(tc.spec, tc.ioFDs, nil, &podMountHints{})
+			var ioFDs []*fd.FD
+			for _, ioFD := range tc.ioFDs {
+				ioFDs = append(ioFDs, fd.New(ioFD))
+			}
+			mntr := newContainerMounter(tc.spec, ioFDs, nil, &podMountHints{})
 			actualRenv, err := mntr.createRestoreEnvironment(conf)
 			if !tc.errorExpected && err != nil {
 				t.Fatalf("could not create restore environment for test:%s", tc.name)
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 4e1fa7665..988573640 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -33,6 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 var (
@@ -78,44 +79,6 @@ type DefaultRoute struct {
 	Name  string
 }
 
-// QueueingDiscipline is used to specify the kind of Queueing Discipline to
-// apply for a give FDBasedLink.
-type QueueingDiscipline int
-
-const (
-	// QDiscNone disables any queueing for the underlying FD.
-	QDiscNone QueueingDiscipline = iota
-
-	// QDiscFIFO applies a simple fifo based queue to the underlying
-	// FD.
-	QDiscFIFO
-)
-
-// MakeQueueingDiscipline if possible the equivalent QueuingDiscipline for s
-// else returns an error.
-func MakeQueueingDiscipline(s string) (QueueingDiscipline, error) {
-	switch s {
-	case "none":
-		return QDiscNone, nil
-	case "fifo":
-		return QDiscFIFO, nil
-	default:
-		return 0, fmt.Errorf("unsupported qdisc specified: %q", s)
-	}
-}
-
-// String implements fmt.Stringer.
-func (q QueueingDiscipline) String() string {
-	switch q {
-	case QDiscNone:
-		return "none"
-	case QDiscFIFO:
-		return "fifo"
-	default:
-		panic(fmt.Sprintf("Invalid queueing discipline: %d", q))
-	}
-}
-
 // FDBasedLink configures an fd-based link.
 type FDBasedLink struct {
 	Name               string
@@ -127,7 +90,7 @@ type FDBasedLink struct {
 	TXChecksumOffload  bool
 	RXChecksumOffload  bool
 	LinkAddress        net.HardwareAddr
-	QDisc              QueueingDiscipline
+	QDisc              config.QueueingDiscipline
 
 	// NumChannels controls how many underlying FD's are to be used to
 	// create this endpoint.
@@ -247,8 +210,8 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		}
 
 		switch link.QDisc {
-		case QDiscNone:
-		case QDiscFIFO:
+		case config.QDiscNone:
+		case config.QDiscFIFO:
 			log.Infof("Enabling FIFO QDisc on %q", link.Name)
 			linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000)
 		}
diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go
index fbfd3b07c..c21648a32 100644
--- a/runsc/boot/strace.go
+++ b/runsc/boot/strace.go
@@ -15,10 +15,13 @@
 package boot
 
 import (
+	"strings"
+
 	"gvisor.dev/gvisor/pkg/sentry/strace"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
-func enableStrace(conf *Config) error {
+func enableStrace(conf *config.Config) error {
 	// We must initialize even if strace is not enabled.
 	strace.Initialize()
 
@@ -36,5 +39,5 @@ func enableStrace(conf *Config) error {
 		strace.EnableAll(strace.SinkTypeLog)
 		return nil
 	}
-	return strace.Enable(conf.StraceSyscalls, strace.SinkTypeLog)
+	return strace.Enable(strings.Split(conf.StraceSyscalls, ","), strace.SinkTypeLog)
 }
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index 08dce8b6c..b157387ef 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -16,12 +16,12 @@ package boot
 
 import (
 	"fmt"
-	"path"
 	"sort"
 	"strings"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/cleanup"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
@@ -42,6 +42,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/runsc/config"
 )
 
 func registerFilesystems(k *kernel.Kernel) error {
@@ -133,8 +134,8 @@ func registerFilesystems(k *kernel.Kernel) error {
 	return nil
 }
 
-func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
-	mns, err := mntr.setupVFS2(ctx, conf, procArgs)
+func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+	mns, err := mntr.mountAll(conf, procArgs)
 	if err != nil {
 		return fmt.Errorf("failed to setupFS: %w", err)
 	}
@@ -149,7 +150,7 @@ func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounte
 	return nil
 }
 
-func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
+func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
 	log.Infof("Configuring container's file system with VFS2")
 
 	// Create context with root credentials to mount the filesystem (the current
@@ -168,35 +169,141 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs
 	}
 	rootProcArgs.MountNamespaceVFS2 = mns
 
+	root := mns.Root()
+	root.IncRef()
+	defer root.DecRef(rootCtx)
+	if root.Mount().ReadOnly() {
+		// Switch to ReadWrite while we setup submounts.
+		if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil {
+			return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err)
+		}
+		// Restore back to ReadOnly at the end.
+		defer func() {
+			if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil {
+				panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err))
+			}
+		}()
+	}
+
 	// Mount submounts.
 	if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil {
 		return nil, fmt.Errorf("mounting submounts vfs2: %w", err)
 	}
+
 	return mns, nil
 }
 
-func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
+// createMountNamespaceVFS2 creates the container's root mount and namespace.
+func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
 	fd := c.fds.remove()
-	opts := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
+	data := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
 
 	if conf.OverlayfsStaleRead {
 		// We can't check for overlayfs here because sandbox is chroot'ed and gofer
 		// can only send mount options for specs.Mounts (specs.Root is missing
 		// Options field). So assume root is always on top of overlayfs.
-		opts = append(opts, "overlayfs_stale_read")
+		data = append(data, "overlayfs_stale_read")
 	}
 
 	log.Infof("Mounting root over 9P, ioFD: %d", fd)
-	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{
-		Data: strings.Join(opts, ","),
-	})
+	opts := &vfs.MountOptions{
+		ReadOnly: c.root.Readonly,
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			Data: strings.Join(data, ","),
+			InternalData: gofer.InternalFilesystemOptions{
+				UniqueID: "/",
+			},
+		},
+		InternalMount: true,
+	}
+
+	fsName := gofer.Name
+	if conf.Overlay && !c.root.Readonly {
+		log.Infof("Adding overlay on top of root")
+		var err error
+		var cleanup func()
+		opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+		if err != nil {
+			return nil, fmt.Errorf("mounting root with overlay: %w", err)
+		}
+		defer cleanup()
+		fsName = overlay.Name
+	}
+
+	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", fsName, opts)
 	if err != nil {
 		return nil, fmt.Errorf("setting up mount namespace: %w", err)
 	}
 	return mns, nil
 }
 
-func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
+// configureOverlay mounts the lower layer using "lowerOpts", mounts the upper
+// layer using tmpfs, and return overlay mount options. "cleanup" must be called
+// after the options have been used to mount the overlay, to release refs on
+// lower and upper mounts.
+func (c *containerMounter) configureOverlay(ctx context.Context, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string) (*vfs.MountOptions, func(), error) {
+	// First copy options from lower layer to upper layer and overlay. Clear
+	// filesystem specific options.
+	upperOpts := *lowerOpts
+	upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
+
+	overlayOpts := *lowerOpts
+	overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
+
+	// Next mount upper and lower. Upper is a tmpfs mount to keep all
+	// modifications inside the sandbox.
+	upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err)
+	}
+	cu := cleanup.Make(func() { upper.DecRef(ctx) })
+	defer cu.Clean()
+
+	// All writes go to the upper layer, be paranoid and make lower readonly.
+	lowerOpts.ReadOnly = true
+	lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts)
+	if err != nil {
+		return nil, nil, err
+	}
+	cu.Add(func() { lower.DecRef(ctx) })
+
+	// Propagate the lower layer's root's owner, group, and mode to the upper
+	// layer's root for consistency with VFS1.
+	upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root())
+	lowerRootVD := vfs.MakeVirtualDentry(lower, lower.Root())
+	stat, err := c.k.VFS().StatAt(ctx, creds, &vfs.PathOperation{
+		Root:  lowerRootVD,
+		Start: lowerRootVD,
+	}, &vfs.StatOptions{
+		Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE,
+	})
+	if err != nil {
+		return nil, nil, err
+	}
+	err = c.k.VFS().SetStatAt(ctx, creds, &vfs.PathOperation{
+		Root:  upperRootVD,
+		Start: upperRootVD,
+	}, &vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: (linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE) & stat.Mask,
+			UID:  stat.UID,
+			GID:  stat.GID,
+			Mode: stat.Mode,
+		},
+	})
+	if err != nil {
+		return nil, nil, err
+	}
+
+	// Configure overlay with both layers.
+	overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{
+		UpperRoot:  upperRootVD,
+		LowerRoots: []vfs.VirtualDentry{lowerRootVD},
+	}
+	return &overlayOpts, cu.Release(), nil
+}
+
+func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
 	mounts, err := c.prepareMountsVFS2()
 	if err != nil {
 		return err
@@ -205,15 +312,35 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config,
 	for i := range mounts {
 		submount := &mounts[i]
 		log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options)
+		var (
+			mnt *vfs.Mount
+			err error
+		)
+
 		if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() {
-			if err := c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint); err != nil {
+			mnt, err = c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint)
+			if err != nil {
 				return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err)
 			}
 		} else {
-			if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil {
+			mnt, err = c.mountSubmountVFS2(ctx, conf, mns, creds, submount)
+			if err != nil {
 				return fmt.Errorf("mount submount %q: %w", submount.Destination, err)
 			}
 		}
+
+		if mnt != nil && mnt.ReadOnly() {
+			// Switch to ReadWrite while we setup submounts.
+			if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
+				return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.Destination, err)
+			}
+			// Restore back to ReadOnly at the end.
+			defer func() {
+				if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
+					panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.Destination, err))
+				}
+			}()
+		}
 	}
 
 	if err := c.mountTmpVFS2(ctx, conf, creds, mns); err != nil {
@@ -256,38 +383,54 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
 	return mounts, nil
 }
 
-func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error {
+func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) (*vfs.Mount, error) {
+	fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, submount)
+	if err != nil {
+		return nil, fmt.Errorf("mountOptions failed: %w", err)
+	}
+	if len(fsName) == 0 {
+		// Filesystem is not supported (e.g. cgroup), just skip it.
+		return nil, nil
+	}
+
+	if err := c.makeMountPoint(ctx, creds, mns, submount.Destination); err != nil {
+		return nil, fmt.Errorf("creating mount point %q: %w", submount.Destination, err)
+	}
+
+	if useOverlay {
+		log.Infof("Adding overlay on top of mount %q", submount.Destination)
+		var cleanup func()
+		opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+		if err != nil {
+			return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.Destination, err)
+		}
+		defer cleanup()
+		fsName = overlay.Name
+	}
+
 	root := mns.Root()
+	root.IncRef()
 	defer root.DecRef(ctx)
 	target := &vfs.PathOperation{
 		Root:  root,
 		Start: root,
 		Path:  fspath.Parse(submount.Destination),
 	}
-	fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount)
+	mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts)
 	if err != nil {
-		return fmt.Errorf("mountOptions failed: %w", err)
-	}
-	if len(fsName) == 0 {
-		// Filesystem is not supported (e.g. cgroup), just skip it.
-		return nil
-	}
-
-	if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil {
-		return err
-	}
-	if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil {
-		return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
+		return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
 	}
 	log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts.GetFilesystemOptions.Data)
-	return nil
+	return mnt, nil
 }
 
 // getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
 // used for mounts.
-func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) {
+func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, bool, error) {
 	fsName := m.Type
+	useOverlay := false
 	var data []string
+	var iopts interface{}
 
 	// Find filesystem name and FS specific data field.
 	switch m.Type {
@@ -301,7 +444,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF
 		var err error
 		data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
 		if err != nil {
-			return "", nil, err
+			return "", nil, false, err
 		}
 
 	case bind:
@@ -309,18 +452,25 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF
 		if m.fd == 0 {
 			// Check that an FD was provided to fails fast. Technically FD=0 is valid,
 			// but unlikely to be correct in this context.
-			return "", nil, fmt.Errorf("9P mount requires a connection FD")
+			return "", nil, false, fmt.Errorf("9P mount requires a connection FD")
 		}
 		data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)
+		iopts = gofer.InternalFilesystemOptions{
+			UniqueID: m.Destination,
+		}
+
+		// If configured, add overlay to all writable mounts.
+		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
 
 	default:
 		log.Warningf("ignoring unknown filesystem type %q", m.Type)
-		return "", nil, nil
+		return "", nil, false, nil
 	}
 
 	opts := &vfs.MountOptions{
 		GetFilesystemOptions: vfs.GetFilesystemOptions{
-			Data: strings.Join(data, ","),
+			Data:         strings.Join(data, ","),
+			InternalData: iopts,
 		},
 		InternalMount: true,
 	}
@@ -340,38 +490,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF
 		}
 	}
 
-	if conf.Overlay {
-		// All writes go to upper, be paranoid and make lower readonly.
-		opts.ReadOnly = true
-	}
-	return fsName, opts, nil
-}
-
-func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error {
-	target := &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(currentPath),
-	}
-	_, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{})
-	if err == nil {
-		log.Debugf("Mount point %q already exists", currentPath)
-		return nil
-	}
-	if err != syserror.ENOENT {
-		return fmt.Errorf("stat failed for %q during mount point creation: %w", currentPath, err)
-	}
-
-	// Recurse to ensure parent is created and then create the mount point.
-	if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil {
-		return err
-	}
-	log.Debugf("Creating dir %q for mount point", currentPath)
-	mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
-	if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil {
-		return fmt.Errorf("failed to create directory %q for mount: %w", currentPath, err)
-	}
-	return nil
+	return fsName, opts, useOverlay, nil
 }
 
 // mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so.
@@ -383,7 +502,7 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s
 //
 // Note that when there are submounts inside of '/tmp', directories for the
 // mount points must be present, making '/tmp' not empty anymore.
-func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
+func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
 	for _, m := range c.mounts {
 		// m.Destination has been cleaned, so it's to use equality here.
 		if m.Destination == "/tmp" {
@@ -393,6 +512,7 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds
 	}
 
 	root := mns.Root()
+	root.IncRef()
 	defer root.DecRef(ctx)
 	pop := vfs.PathOperation{
 		Root:  root,
@@ -434,7 +554,8 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds
 			// another user. This is normally done for /tmp.
 			Options: []string{"mode=01777"},
 		}
-		return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
+		_, err := c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
+		return err
 
 	case syserror.ENOTDIR:
 		// Not a dir?! Let it be.
@@ -448,7 +569,7 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds
 // processHintsVFS2 processes annotations that container hints about how volumes
 // should be mounted (e.g. a volume shared between containers). It must be
 // called for the root container only.
-func (c *containerMounter) processHintsVFS2(conf *Config, creds *auth.Credentials) error {
+func (c *containerMounter) processHintsVFS2(conf *config.Config, creds *auth.Credentials) error {
 	ctx := c.k.SupervisorContext()
 	for _, hint := range c.hints.mounts {
 		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
@@ -469,51 +590,106 @@ func (c *containerMounter) processHintsVFS2(conf *Config, creds *auth.Credential
 
 // mountSharedMasterVFS2 mounts the master of a volume that is shared among
 // containers in a pod.
-func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) {
+func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *config.Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) {
 	// Map mount type to filesystem name, and parse out the options that we are
 	// capable of dealing with.
 	mntFD := &mountAndFD{Mount: hint.mount}
-	fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, mntFD)
+	fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, mntFD)
 	if err != nil {
 		return nil, err
 	}
 	if len(fsName) == 0 {
 		return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
 	}
+
+	if useOverlay {
+		log.Infof("Adding overlay on top of shared mount %q", mntFD.Destination)
+		var cleanup func()
+		opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+		if err != nil {
+			return nil, fmt.Errorf("mounting shared volume with overlay at %q: %w", mntFD.Destination, err)
+		}
+		defer cleanup()
+		fsName = overlay.Name
+	}
+
 	return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
 }
 
 // mountSharedSubmount binds mount to a previously mounted volume that is shared
 // among containers in the same pod.
-func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) error {
+func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) (*vfs.Mount, error) {
 	if err := source.checkCompatible(mount); err != nil {
-		return err
+		return nil, err
 	}
 
-	_, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount})
+	// Ignore data and useOverlay because these were already applied to
+	// the master mount.
+	_, opts, _, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount})
 	if err != nil {
-		return err
+		return nil, err
 	}
 	newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts)
 	if err != nil {
-		return err
+		return nil, err
 	}
 	defer newMnt.DecRef(ctx)
 
 	root := mns.Root()
+	root.IncRef()
 	defer root.DecRef(ctx)
-	if err := c.makeSyntheticMount(ctx, mount.Destination, root, creds); err != nil {
-		return err
-	}
-
 	target := &vfs.PathOperation{
 		Root:  root,
 		Start: root,
 		Path:  fspath.Parse(mount.Destination),
 	}
+
+	if err := c.makeMountPoint(ctx, creds, mns, mount.Destination); err != nil {
+		return nil, fmt.Errorf("creating mount point %q: %w", mount.Destination, err)
+	}
+
 	if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
-		return err
+		return nil, err
 	}
 	log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
-	return nil
+	return newMnt, nil
+}
+
+func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error {
+	root := mns.Root()
+	root.IncRef()
+	defer root.DecRef(ctx)
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(dest),
+	}
+	// First check if mount point exists. When overlay is enabled, gofer doesn't
+	// allow changes to the FS, making MakeSytheticMountpoint() ineffective
+	// because MkdirAt fails with EROFS even if file exists.
+	vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{})
+	if err == nil {
+		// File exists, we're done.
+		vd.DecRef(ctx)
+		return nil
+	}
+	return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds)
+}
+
+// configureRestore returns an updated context.Context including filesystem
+// state used by restore defined by conf.
+func (c *containerMounter) configureRestore(ctx context.Context, conf *config.Config) (context.Context, error) {
+	fdmap := make(map[string]int)
+	fdmap["/"] = c.fds.remove()
+	mounts, err := c.prepareMountsVFS2()
+	if err != nil {
+		return ctx, err
+	}
+	for i := range c.mounts {
+		submount := &mounts[i]
+		if submount.fd >= 0 {
+			fdmap[submount.Destination] = submount.fd
+		}
+	}
+	return context.WithValue(ctx, gofer.CtxRestoreServerFDMap, fdmap), nil
 }