Diffstat (limited to 'runsc')
37 files changed, 1020 insertions, 433 deletions
diff --git a/runsc/BUILD b/runsc/BUILD index 3b91b984a..7a7dcc8d5 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -9,6 +9,7 @@ go_binary( "version.go", ], pure = True, + tags = ["staging"], visibility = [ "//visibility:public", ], @@ -49,5 +50,4 @@ sh_test( srcs = ["version_test.sh"], args = ["$(location :runsc)"], data = [":runsc"], - tags = ["noguitar"], ) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 67307ab3c..d51347fe1 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -30,6 +30,7 @@ go_library( "//pkg/cleanup", "//pkg/context", "//pkg/control/server", + "//pkg/coverage", "//pkg/cpuid", "//pkg/eventchannel", "//pkg/fd", @@ -37,6 +38,7 @@ go_library( "//pkg/fspath", "//pkg/log", "//pkg/memutil", + "//pkg/metric", "//pkg/rand", "//pkg/refs", "//pkg/refsvfs2", @@ -57,6 +59,7 @@ go_library( "//pkg/sentry/fs/tmpfs", "//pkg/sentry/fs/tty", "//pkg/sentry/fs/user", + "//pkg/sentry/fsimpl/cgroupfs", "//pkg/sentry/fsimpl/devpts", "//pkg/sentry/fsimpl/devtmpfs", "//pkg/sentry/fsimpl/fuse", @@ -66,6 +69,7 @@ go_library( "//pkg/sentry/fsimpl/proc", "//pkg/sentry/fsimpl/sys", "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/fsimpl/verity", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel:uncaught_signal_go_proto", diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 1ae76d7d7..9b270cbf2 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -400,9 +400,9 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { // Set up the restore environment. ctx := k.SupervisorContext() - mntr := newContainerMounter(cm.l.root.spec, cm.l.root.goferFDs, cm.l.k, cm.l.mountHints, kernel.VFS2Enabled) + mntr := newContainerMounter(&cm.l.root, cm.l.k, cm.l.mountHints, kernel.VFS2Enabled) if kernel.VFS2Enabled { - ctx, err = mntr.configureRestore(ctx, cm.l.root.conf) + ctx, err = mntr.configureRestore(ctx) if err != nil { return fmt.Errorf("configuring filesystem restore: %v", err) } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 32adde643..bf4a41f77 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -31,6 +31,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/gofer" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/fs/user" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/cgroupfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" gofervfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer" @@ -103,17 +104,22 @@ func addOverlay(ctx context.Context, conf *config.Config, lower *fs.Inode, name // compileMounts returns the supported mounts from the mount spec, adding any // mandatory mounts that are required by the OCI specification. -func compileMounts(spec *specs.Spec, vfs2Enabled bool) []specs.Mount { +func compileMounts(spec *specs.Spec, conf *config.Config, vfs2Enabled bool) []specs.Mount { // Keep track of whether proc and sys were mounted. var procMounted, sysMounted, devMounted, devptsMounted bool var mounts []specs.Mount // Mount all submounts from the spec. for _, m := range spec.Mounts { - if !vfs2Enabled && !specutils.IsVFS1SupportedDevMount(m) { + if !specutils.IsSupportedDevMount(m, vfs2Enabled) { log.Warningf("ignoring dev mount at %q", m.Destination) continue } + // Unconditionally drop any cgroupfs mounts. If requested, we'll add our + // own below. 
+ if m.Type == cgroupfs.Name { + continue + } switch filepath.Clean(m.Destination) { case "/proc": procMounted = true @@ -132,6 +138,24 @@ func compileMounts(spec *specs.Spec, vfs2Enabled bool) []specs.Mount { // Mount proc and sys even if the user did not ask for it, as the spec // says we SHOULD. var mandatoryMounts []specs.Mount + + if conf.Cgroupfs { + mandatoryMounts = append(mandatoryMounts, specs.Mount{ + Type: tmpfsvfs2.Name, + Destination: "/sys/fs/cgroup", + }) + mandatoryMounts = append(mandatoryMounts, specs.Mount{ + Type: cgroupfs.Name, + Destination: "/sys/fs/cgroup/memory", + Options: []string{"memory"}, + }) + mandatoryMounts = append(mandatoryMounts, specs.Mount{ + Type: cgroupfs.Name, + Destination: "/sys/fs/cgroup/cpu", + Options: []string{"cpu"}, + }) + } + if !procMounted { mandatoryMounts = append(mandatoryMounts, specs.Mount{ Type: procvfs2.Name, @@ -208,7 +232,7 @@ func parseMountOption(opt string, allowedKeys ...string) (bool, error) { // mountDevice returns a device string based on the fs type and target // of the mount. -func mountDevice(m specs.Mount) string { +func mountDevice(m *specs.Mount) string { if m.Type == bind { // Make a device string that includes the target, which is consistent across // S/R and uniquely identifies the connection. @@ -232,6 +256,8 @@ func mountFlags(opts []string) fs.MountSourceFlags { mf.NoAtime = true case "noexec": mf.NoExec = true + case "bind", "rbind": + // These are the same as a mount with type="bind". default: log.Warningf("ignoring unknown mount option %q", o) } @@ -248,6 +274,10 @@ func isSupportedMountFlag(fstype, opt string) bool { ok, err := parseMountOption(opt, tmpfsAllowedData...) return ok && err == nil } + if fstype == cgroupfs.Name { + ok, err := parseMountOption(opt, cgroupfs.SupportedMountOptions...) + return ok && err == nil + } return false } @@ -458,9 +488,9 @@ func (m *mountHint) isSupported() bool { // For now enforce that all options are the same. Once bind mount is properly // supported, then we should ensure the master is less restrictive than the // container, e.g. master can be 'rw' while container mounts as 'ro'. -func (m *mountHint) checkCompatible(mount specs.Mount) error { +func (m *mountHint) checkCompatible(mount *specs.Mount) error { // Remove options that don't affect to mount's behavior. 
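// Illustrative sketch (not part of the patch): the allowlist-style filtering that filterUnsupportedOptions and isSupportedMountFlag perform before the comparison below, reduced to a self-contained program. The names here are made up.
package main

import (
	"fmt"
	"strings"
)

// filterAllowed keeps only options whose key (the part before "=") appears in
// the allowlist, so flags that don't affect behavior can't make two otherwise
// identical mounts look different.
func filterAllowed(opts, allowed []string) []string {
	allowedSet := make(map[string]bool, len(allowed))
	for _, a := range allowed {
		allowedSet[a] = true
	}
	var rv []string
	for _, o := range opts {
		if allowedSet[strings.SplitN(o, "=", 2)[0]] {
			rv = append(rv, o)
		}
	}
	return rv
}

func main() {
	// With an allowlist of {"mode", "size"}, "nosuid" is dropped.
	fmt.Println(filterAllowed([]string{"mode=01777", "nosuid", "size=8m"}, []string{"mode", "size"}))
	// Output: [mode=01777 size=8m]
}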
- masterOpts := filterUnsupportedOptions(m.mount) + masterOpts := filterUnsupportedOptions(&m.mount) replicaOpts := filterUnsupportedOptions(mount) if len(masterOpts) != len(replicaOpts) { @@ -484,7 +514,7 @@ func (m *mountHint) fileAccessType() config.FileAccessType { return config.FileAccessShared } -func filterUnsupportedOptions(mount specs.Mount) []string { +func filterUnsupportedOptions(mount *specs.Mount) []string { rv := make([]string, 0, len(mount.Options)) for _, o := range mount.Options { if isSupportedMountFlag(mount.Type, o) { @@ -548,7 +578,7 @@ func newPodMountHints(spec *specs.Spec) (*podMountHints, error) { return &podMountHints{mounts: mnts}, nil } -func (p *podMountHints) findMount(mount specs.Mount) *mountHint { +func (p *podMountHints) findMount(mount *specs.Mount) *mountHint { for _, m := range p.mounts { if m.mount.Source == mount.Source { return m @@ -572,11 +602,11 @@ type containerMounter struct { hints *podMountHints } -func newContainerMounter(spec *specs.Spec, goferFDs []*fd.FD, k *kernel.Kernel, hints *podMountHints, vfs2Enabled bool) *containerMounter { +func newContainerMounter(info *containerInfo, k *kernel.Kernel, hints *podMountHints, vfs2Enabled bool) *containerMounter { return &containerMounter{ - root: spec.Root, - mounts: compileMounts(spec, vfs2Enabled), - fds: fdDispenser{fds: goferFDs}, + root: info.spec.Root, + mounts: compileMounts(info.spec, info.conf, vfs2Enabled), + fds: fdDispenser{fds: info.goferFDs}, k: k, hints: hints, } @@ -651,7 +681,8 @@ func (c *containerMounter) mountSubmounts(ctx context.Context, conf *config.Conf root := mns.Root() defer root.DecRef(ctx) - for _, m := range c.mounts { + for i := range c.mounts { + m := &c.mounts[i] log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options) if hint := c.hints.findMount(m); hint != nil && hint.isSupported() { if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil { @@ -686,7 +717,7 @@ func (c *containerMounter) checkDispenser() error { func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *config.Config, hint *mountHint) (*fs.Inode, error) { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. - fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount) + fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, &hint.mount) if err != nil { return nil, err } @@ -706,7 +737,7 @@ func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *config.C mf.ReadOnly = true } - inode, err := filesystem.Mount(ctx, mountDevice(hint.mount), mf, strings.Join(opts, ","), nil) + inode, err := filesystem.Mount(ctx, mountDevice(&hint.mount), mf, strings.Join(opts, ","), nil) if err != nil { return nil, fmt.Errorf("creating mount %q: %v", hint.name, err) } @@ -768,13 +799,14 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Con // getMountNameAndOptions retrieves the fsName, opts, and useOverlay values // used for mounts. 
-func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m specs.Mount) (string, []string, bool, error) { +func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m *specs.Mount) (string, []string, bool, error) { + specutils.MaybeConvertToBindMount(m) + var ( fsName string opts []string useOverlay bool ) - switch m.Type { case devpts.Name, devtmpfs.Name, procvfs2.Name, sysvfs2.Name: fsName = m.Type @@ -795,14 +827,20 @@ func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m specs.M opts = p9MountData(fd, c.getMountAccessType(conf, m), conf.VFS2) // If configured, add overlay to all writable mounts. useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly - + case cgroupfs.Name: + fsName = m.Type + var err error + opts, err = parseAndFilterOptions(m.Options, cgroupfs.SupportedMountOptions...) + if err != nil { + return "", nil, false, err + } default: log.Warningf("ignoring unknown filesystem type %q", m.Type) } return fsName, opts, useOverlay, nil } -func (c *containerMounter) getMountAccessType(conf *config.Config, mount specs.Mount) config.FileAccessType { +func (c *containerMounter) getMountAccessType(conf *config.Config, mount *specs.Mount) config.FileAccessType { if hint := c.hints.findMount(mount); hint != nil { return hint.fileAccessType() } @@ -813,7 +851,7 @@ func (c *containerMounter) getMountAccessType(conf *config.Config, mount specs.M // be readonly, a lower ramfs overlay is added to create the mount point dir. // Another overlay is added with tmpfs on top if Config.Overlay is true. // 'm.Destination' must be an absolute path with '..' and symlinks resolved. -func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error { +func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent, m *specs.Mount) error { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m) @@ -887,7 +925,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Confi // mountSharedSubmount binds mount to a previously mounted volume that is shared // among containers in the same pod. -func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount specs.Mount, source *mountHint) error { +func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount *specs.Mount, source *mountHint) error { if err := source.checkCompatible(mount); err != nil { return err } @@ -912,7 +950,7 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun // addRestoreMount adds a mount to the MountSources map used for restoring a // checkpointed container. -func (c *containerMounter) addRestoreMount(conf *config.Config, renv *fs.RestoreEnvironment, m specs.Mount) error { +func (c *containerMounter) addRestoreMount(conf *config.Config, renv *fs.RestoreEnvironment, m *specs.Mount) error { fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m) if err != nil { return err @@ -960,7 +998,8 @@ func (c *containerMounter) createRestoreEnvironment(conf *config.Config) (*fs.Re // Add submounts. 
var tmpMounted bool - for _, m := range c.mounts { + for i := range c.mounts { + m := &c.mounts[i] if err := c.addRestoreMount(conf, renv, m); err != nil { return nil, err } @@ -975,7 +1014,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *config.Config) (*fs.Re Type: tmpfsvfs2.Name, Destination: "/tmp", } - if err := c.addRestoreMount(conf, renv, tmpMount); err != nil { + if err := c.addRestoreMount(conf, renv, &tmpMount); err != nil { return nil, err } } @@ -1034,7 +1073,7 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *config.Config, mn // another user. This is normally done for /tmp. Options: []string{"mode=01777"}, } - return c.mountSubmount(ctx, conf, mns, root, tmpMount) + return c.mountSubmount(ctx, conf, mns, root, &tmpMount) default: return err diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go index b4f12d034..09ffda628 100644 --- a/runsc/boot/fs_test.go +++ b/runsc/boot/fs_test.go @@ -244,7 +244,7 @@ func TestGetMountAccessType(t *testing.T) { } mounter := containerMounter{hints: podHints} conf := &config.Config{FileAccessMounts: config.FileAccessShared} - if got := mounter.getMountAccessType(conf, specs.Mount{Source: source}); got != tst.want { + if got := mounter.getMountAccessType(conf, &specs.Mount{Source: source}); got != tst.want { t.Errorf("getMountAccessType(), want: %v, got: %v", tst.want, got) } }) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 774621970..10f2d3d35 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -29,10 +29,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/coverage" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/memutil" + "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/refsvfs2" @@ -216,6 +218,8 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("setting up memory usage: %v", err) } + metric.CreateSentryMetrics() + // Is this a VFSv2 kernel? if args.Conf.VFS2 { kernel.VFS2Enabled = true @@ -226,6 +230,33 @@ func New(args Args) (*Loader, error) { vfs2.Override() } + // Make host FDs stable between invocations. Host FDs must map to the exact + // same number when the sandbox is restored. Otherwise the wrong FD will be + // used. + info := containerInfo{} + newfd := startingStdioFD + + for _, stdioFD := range args.StdioFDs { + // Check that newfd is unused to avoid clobbering over it. + if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) { + if err != nil { + return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err) + } + return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd) + } + + err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC) + if err != nil { + return nil, fmt.Errorf("dup3 of stdios failed: %w", err) + } + info.stdioFDs = append(info.stdioFDs, fd.New(newfd)) + _ = unix.Close(stdioFD) + newfd++ + } + for _, goferFD := range args.GoferFDs { + info.goferFDs = append(info.goferFDs, fd.New(goferFD)) + } + // Create kernel and platform. 
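// Illustrative sketch (not part of the patch): the dup3-based remapping added above ("Make host FDs stable between invocations"), isolated into a self-contained program. remapStable and the FD values are placeholders.
package main

import (
	"errors"
	"fmt"

	"golang.org/x/sys/unix"
)

// remapStable duplicates each FD onto consecutive numbers starting at start,
// so the same host resources land on the same FD numbers across save/restore.
// It refuses to clobber a target number that is already in use.
func remapStable(fds []int, start int) ([]int, error) {
	out := make([]int, 0, len(fds))
	newfd := start
	for _, f := range fds {
		// EBADF from F_GETFD means newfd is free; anything else means it is
		// occupied (or the probe itself failed).
		if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
			if err != nil {
				return nil, fmt.Errorf("checking FD %d: %w", newfd, err)
			}
			return nil, fmt.Errorf("FD %d is already in use", newfd)
		}
		if err := unix.Dup3(f, newfd, unix.O_CLOEXEC); err != nil {
			return nil, fmt.Errorf("dup3(%d, %d): %w", f, newfd, err)
		}
		_ = unix.Close(f) // the original number is no longer needed
		out = append(out, newfd)
		newfd++
	}
	return out, nil
}

func main() {
	// Open three placeholder descriptors and remap them to 64, 65, 66.
	var fds []int
	for i := 0; i < 3; i++ {
		fd, err := unix.Open("/dev/null", unix.O_RDONLY, 0)
		if err != nil {
			panic(err)
		}
		fds = append(fds, fd)
	}
	fmt.Println(remapStable(fds, 64))
}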
p, err := createPlatform(args.Conf, args.Device) if err != nil { @@ -345,6 +376,7 @@ func New(args Args) (*Loader, error) { if err != nil { return nil, fmt.Errorf("creating init process for root container: %v", err) } + info.procArgs = procArgs if err := initCompatLogs(args.UserLogFD); err != nil { return nil, fmt.Errorf("initializing compat logs: %v", err) @@ -355,6 +387,9 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("creating pod mount hints: %v", err) } + info.conf = args.Conf + info.spec = args.Spec + if kernel.VFS2Enabled { // Set up host mount that will be used for imported fds. hostFilesystem, err := hostvfs2.NewFilesystem(k.VFS()) @@ -369,37 +404,6 @@ func New(args Args) (*Loader, error) { k.SetHostMount(hostMount) } - info := containerInfo{ - conf: args.Conf, - spec: args.Spec, - procArgs: procArgs, - } - - // Make host FDs stable between invocations. Host FDs must map to the exact - // same number when the sandbox is restored. Otherwise the wrong FD will be - // used. - newfd := startingStdioFD - for _, stdioFD := range args.StdioFDs { - // Check that newfd is unused to avoid clobbering over it. - if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) { - if err != nil { - return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err) - } - return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd) - } - - err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC) - if err != nil { - return nil, fmt.Errorf("dup3 of stdios failed: %w", err) - } - info.stdioFDs = append(info.stdioFDs, fd.New(newfd)) - _ = unix.Close(stdioFD) - newfd++ - } - for _, goferFD := range args.GoferFDs { - info.goferFDs = append(info.goferFDs, fd.New(goferFD)) - } - eid := execID{cid: args.ID} l := &Loader{ k: k, @@ -491,10 +495,6 @@ func (l *Loader) Destroy() { // save/restore. l.k.Release() - // All sentry-created resources should have been released at this point; - // check for reference leaks. - refsvfs2.DoLeakCheck() - // In the success case, stdioFDs and goferFDs will only contain // released/closed FDs that ownership has been passed over to host FDs and // gofer sessions. Close them here in case of failure. @@ -752,7 +752,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn // Setup the child container file system. l.startGoferMonitor(cid, info.goferFDs) - mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints, kernel.VFS2Enabled) + mntr := newContainerMounter(info, l.k, l.mountHints, kernel.VFS2Enabled) if root { if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil { return nil, nil, nil, err @@ -1000,6 +1000,15 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // consider the container exited. ws := l.wait(tg) *waitStatus = ws + + // Check for leaks and write coverage report after the root container has + // exited. This guarantees that the report is written in cases where the + // sandbox is killed by a signal after the ContainerWait request is completed. + if l.root.procArgs.ContainerID == cid { + // All sentry-created resources should have been released at this point. 
+ refsvfs2.DoLeakCheck() + coverage.Report() + } return nil } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 8b39bc59a..93c476971 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -439,7 +439,13 @@ func TestCreateMountNamespace(t *testing.T) { } defer cleanup() - mntr := newContainerMounter(&tc.spec, []*fd.FD{fd.New(sandEnd)}, nil, &podMountHints{}, false /* vfs2Enabled */) + info := containerInfo{ + conf: conf, + spec: &tc.spec, + goferFDs: []*fd.FD{fd.New(sandEnd)}, + } + + mntr := newContainerMounter(&info, nil, &podMountHints{}, false /* vfs2Enabled */) mns, err := mntr.createMountNamespace(ctx, conf) if err != nil { t.Fatalf("failed to create mount namespace: %v", err) @@ -479,7 +485,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { defer l.Destroy() defer loaderCleanup() - mntr := newContainerMounter(l.root.spec, l.root.goferFDs, l.k, l.mountHints, true /* vfs2Enabled */) + mntr := newContainerMounter(&l.root, l.k, l.mountHints, true /* vfs2Enabled */) if err := mntr.processHints(l.root.conf, l.root.procArgs.Credentials); err != nil { t.Fatalf("failed process hints: %v", err) } @@ -702,7 +708,12 @@ func TestRestoreEnvironment(t *testing.T) { for _, ioFD := range tc.ioFDs { ioFDs = append(ioFDs, fd.New(ioFD)) } - mntr := newContainerMounter(tc.spec, ioFDs, nil, &podMountHints{}, false /* vfs2Enabled */) + info := containerInfo{ + conf: conf, + spec: tc.spec, + goferFDs: ioFDs, + } + mntr := newContainerMounter(&info, nil, &podMountHints{}, false /* vfs2Enabled */) actualRenv, err := mntr.createRestoreEnvironment(conf) if !tc.errorExpected && err != nil { t.Fatalf("could not create restore environment for test:%s", tc.name) diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 9b3dacf46..c1828bd3d 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -16,6 +16,7 @@ package boot import ( "fmt" + "path" "sort" "strings" @@ -29,6 +30,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/devices/ttydev" "gvisor.dev/gvisor/pkg/sentry/devices/tundev" "gvisor.dev/gvisor/pkg/sentry/fs/user" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/cgroupfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/fuse" @@ -37,12 +39,14 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/verity" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/runsc/config" + "gvisor.dev/gvisor/runsc/specutils" ) func registerFilesystems(k *kernel.Kernel) error { @@ -50,6 +54,10 @@ func registerFilesystems(k *kernel.Kernel) error { creds := auth.NewRootCredentials(k.RootUserNamespace()) vfsObj := k.VFS() + vfsObj.MustRegisterFilesystemType(cgroupfs.Name, &cgroupfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserList: true, // TODO(b/29356795): Users may mount this once the terminals are in a @@ -60,6 +68,10 @@ func registerFilesystems(k *kernel.Kernel) error { AllowUserMount: true, AllowUserList: true, }) + vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + 
}) vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserList: true, }) @@ -79,9 +91,9 @@ func registerFilesystems(k *kernel.Kernel) error { AllowUserMount: true, AllowUserList: true, }) - vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ - AllowUserMount: true, + vfsObj.MustRegisterFilesystemType(verity.Name, &verity.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserList: true, + AllowUserMount: true, }) // Setup files in devtmpfs. @@ -351,33 +363,33 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config. for i := range mounts { submount := &mounts[i] - log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options) + log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.mount.Source, submount.mount.Destination, submount.mount.Type, submount.mount.Options) var ( mnt *vfs.Mount err error ) - if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() { - mnt, err = c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint) + if hint := c.hints.findMount(submount.mount); hint != nil && hint.isSupported() { + mnt, err = c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.mount, hint) if err != nil { - return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err) + return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.mount.Destination, err) } } else { mnt, err = c.mountSubmountVFS2(ctx, conf, mns, creds, submount) if err != nil { - return fmt.Errorf("mount submount %q: %w", submount.Destination, err) + return fmt.Errorf("mount submount %q: %w", submount.mount.Destination, err) } } if mnt != nil && mnt.ReadOnly() { // Switch to ReadWrite while we setup submounts. if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil { - return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.Destination, err) + return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.mount.Destination, err) } // Restore back to ReadOnly at the end. defer func() { if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil { - panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.Destination, err)) + panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.mount.Destination, err)) } }() } @@ -390,8 +402,8 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config. } type mountAndFD struct { - specs.Mount - fd int + mount *specs.Mount + fd int } func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) { @@ -399,15 +411,18 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) { // undocumented assumption that FDs are dispensed in the order in which // they are required by mounts. var mounts []mountAndFD - for _, m := range c.mounts { - fd := -1 + for i := range c.mounts { + m := &c.mounts[i] + specutils.MaybeConvertToBindMount(m) + // Only bind mounts use host FDs; see // containerMounter.getMountNameAndOptionsVFS2. + fd := -1 if m.Type == bind { fd = c.fds.remove() } mounts = append(mounts, mountAndFD{ - Mount: m, + mount: m, fd: fd, }) } @@ -417,7 +432,7 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) { // Sort the mounts so that we don't place children before parents. 
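// Illustrative sketch (not part of the patch): why sorting by destination
// length, as done below, mounts parents before children. A parent mount point
// is a strict prefix of its children's paths and therefore strictly shorter;
// unrelated paths may interleave, which is harmless.
package main

import (
	"fmt"
	"sort"
)

func main() {
	dests := []string{"/a/b", "/ab", "/a"}
	sort.Slice(dests, func(i, j int) bool { return len(dests[i]) < len(dests[j]) })
	fmt.Println(dests) // [/a /ab /a/b]: "/a" is mounted before "/a/b"
}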
sort.Slice(mounts, func(i, j int) bool { - return len(mounts[i].Destination) < len(mounts[j].Destination) + return len(mounts[i].mount.Destination) < len(mounts[j].mount.Destination) }) return mounts, nil @@ -433,16 +448,16 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.C return nil, nil } - if err := c.makeMountPoint(ctx, creds, mns, submount.Destination); err != nil { - return nil, fmt.Errorf("creating mount point %q: %w", submount.Destination, err) + if err := c.makeMountPoint(ctx, creds, mns, submount.mount.Destination); err != nil { + return nil, fmt.Errorf("creating mount point %q: %w", submount.mount.Destination, err) } if useOverlay { - log.Infof("Adding overlay on top of mount %q", submount.Destination) + log.Infof("Adding overlay on top of mount %q", submount.mount.Destination) var cleanup func() opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName) if err != nil { - return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.Destination, err) + return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.mount.Destination, err) } defer cleanup() fsName = overlay.Name @@ -454,26 +469,34 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.C target := &vfs.PathOperation{ Root: root, Start: root, - Path: fspath.Parse(submount.Destination), + Path: fspath.Parse(submount.mount.Destination), } mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts) if err != nil { - return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) + return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.mount.Destination, submount.mount.Type, err, opts) } - log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts.GetFilesystemOptions.Data) + log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.mount.Source, submount.mount.Destination, submount.mount.Type, opts.GetFilesystemOptions.Data) return mnt, nil } // getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values // used for mounts. func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, bool, error) { - fsName := m.Type + fsName := m.mount.Type useOverlay := false - var data []string - var iopts interface{} + var ( + data []string + internalData interface{} + ) + + verityData, verityOpts, verityRequested, remainingMOpts, err := parseVerityMountOptions(m.mount.Options) + if err != nil { + return "", nil, false, err + } + m.mount.Options = remainingMOpts // Find filesystem name and FS specific data field. - switch m.Type { + switch m.mount.Type { case devpts.Name, devtmpfs.Name, proc.Name, sys.Name: // Nothing to do. @@ -482,7 +505,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo case tmpfs.Name: var err error - data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...) + data, err = parseAndFilterOptions(m.mount.Options, tmpfsAllowedData...) if err != nil { return "", nil, false, err } @@ -494,28 +517,35 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo // but unlikely to be correct in this context. 
return "", nil, false, fmt.Errorf("9P mount requires a connection FD") } - data = p9MountData(m.fd, c.getMountAccessType(conf, m.Mount), true /* vfs2 */) - iopts = gofer.InternalFilesystemOptions{ - UniqueID: m.Destination, + data = p9MountData(m.fd, c.getMountAccessType(conf, m.mount), true /* vfs2 */) + internalData = gofer.InternalFilesystemOptions{ + UniqueID: m.mount.Destination, } // If configured, add overlay to all writable mounts. - useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly + useOverlay = conf.Overlay && !mountFlags(m.mount.Options).ReadOnly + + case cgroupfs.Name: + var err error + data, err = parseAndFilterOptions(m.mount.Options, cgroupfs.SupportedMountOptions...) + if err != nil { + return "", nil, false, err + } default: - log.Warningf("ignoring unknown filesystem type %q", m.Type) + log.Warningf("ignoring unknown filesystem type %q", m.mount.Type) return "", nil, false, nil } opts := &vfs.MountOptions{ GetFilesystemOptions: vfs.GetFilesystemOptions{ Data: strings.Join(data, ","), - InternalData: iopts, + InternalData: internalData, }, InternalMount: true, } - for _, o := range m.Options { + for _, o := range m.mount.Options { switch o { case "rw": opts.ReadOnly = false @@ -525,14 +555,82 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo opts.Flags.NoATime = true case "noexec": opts.Flags.NoExec = true + case "bind", "rbind": + // These are the same as a mount with type="bind". default: log.Warningf("ignoring unknown mount option %q", o) } } + if verityRequested { + verityData = verityData + "root_name=" + path.Base(m.mount.Destination) + verityOpts.LowerName = fsName + verityOpts.LowerGetFSOptions = opts.GetFilesystemOptions + fsName = verity.Name + opts = &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + Data: verityData, + InternalData: verityOpts, + }, + InternalMount: true, + } + } + return fsName, opts, useOverlay, nil } +func parseKeyValue(s string) (string, string, bool) { + tokens := strings.SplitN(s, "=", 2) + if len(tokens) < 2 { + return "", "", false + } + return strings.TrimSpace(tokens[0]), strings.TrimSpace(tokens[1]), true +} + +// parseVerityMountOptions scans the provided mount options for verity-related +// mount options. It returns the parsed set of verity mount options, as well as +// the filtered set of mount options unrelated to verity.
+func parseVerityMountOptions(mopts []string) (string, verity.InternalFilesystemOptions, bool, []string, error) { + nonVerity := []string{} + found := false + var rootHash string + verityOpts := verity.InternalFilesystemOptions{ + Action: verity.PanicOnViolation, + } + for _, o := range mopts { + if !strings.HasPrefix(o, "verity.") { + nonVerity = append(nonVerity, o) + continue + } + + k, v, ok := parseKeyValue(o) + if !ok { + return "", verityOpts, found, nonVerity, fmt.Errorf("invalid verity mount option with no value: %q", o) + } + + found = true + switch k { + case "verity.roothash": + rootHash = v + case "verity.action": + switch v { + case "error": + verityOpts.Action = verity.ErrorOnViolation + case "panic": + verityOpts.Action = verity.PanicOnViolation + default: + log.Warningf("Invalid verity action %q", v) + verityOpts.Action = verity.PanicOnViolation + } + default: + return "", verityOpts, found, nonVerity, fmt.Errorf("unknown verity mount option: %q", k) + } + } + verityOpts.AllowRuntimeEnable = len(rootHash) == 0 + verityData := "root_hash=" + rootHash + "," + return verityData, verityOpts, found, nonVerity, nil +} + // mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so. // Technically we don't have to mount tmpfs at /tmp, as we could just rely on // the host /tmp, but this is a nice optimization, and fixes some apps that call @@ -594,7 +692,7 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config // another user. This is normally done for /tmp. Options: []string{"mode=01777"}, } - _, err := c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount}) + _, err := c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{mount: &tmpMount}) return err case syserror.ENOTDIR: @@ -633,7 +731,7 @@ func (c *containerMounter) processHintsVFS2(conf *config.Config, creds *auth.Cre func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *config.Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. - mntFD := &mountAndFD{Mount: hint.mount} + mntFD := &mountAndFD{mount: &hint.mount} fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, mntFD) if err != nil { return nil, err @@ -643,11 +741,11 @@ func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *conf } if useOverlay { - log.Infof("Adding overlay on top of shared mount %q", mntFD.Destination) + log.Infof("Adding overlay on top of shared mount %q", mntFD.mount.Destination) var cleanup func() opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName) if err != nil { - return nil, fmt.Errorf("mounting shared volume with overlay at %q: %w", mntFD.Destination, err) + return nil, fmt.Errorf("mounting shared volume with overlay at %q: %w", mntFD.mount.Destination, err) } defer cleanup() fsName = overlay.Name @@ -658,14 +756,14 @@ func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *conf // mountSharedSubmount binds mount to a previously mounted volume that is shared // among containers in the same pod. 
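// Worked example (not part of the patch): expected behavior of
// parseVerityMountOptions above on a sample option list; the values are
// illustrative.
//
//   data, vopts, found, rest, err := parseVerityMountOptions(
//       []string{"rw", "verity.roothash=abc123", "verity.action=error"})
//
//   // data  == "root_hash=abc123,"         (consumed by the verity FS)
//   // vopts.Action             == verity.ErrorOnViolation
//   // vopts.AllowRuntimeEnable == false    (a root hash was supplied)
//   // found == true                        (verity was requested)
//   // rest  == []string{"rw"}              (passed through to the lower FS)
//   // err   == nil
//
// With no "verity.*" options at all, found is false and the mount proceeds
// untouched; with verity options but no roothash, AllowRuntimeEnable is true
// and data is "root_hash=,".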
-func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) (*vfs.Mount, error) { +func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount *specs.Mount, source *mountHint) (*vfs.Mount, error) { if err := source.checkCompatible(mount); err != nil { return nil, err } // Ignore data and useOverlay because these were already applied to // the master mount. - _, opts, _, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount}) + _, opts, _, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{mount: mount}) if err != nil { return nil, err } @@ -718,7 +816,7 @@ func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Crede // configureRestore returns an updated context.Context including filesystem // state used by restore defined by conf. -func (c *containerMounter) configureRestore(ctx context.Context, conf *config.Config) (context.Context, error) { +func (c *containerMounter) configureRestore(ctx context.Context) (context.Context, error) { fdmap := make(map[string]int) fdmap["/"] = c.fds.remove() mounts, err := c.prepareMountsVFS2() @@ -728,7 +826,7 @@ func (c *containerMounter) configureRestore(ctx context.Context, conf *config.Co for i := range c.mounts { submount := &mounts[i] if submount.fd >= 0 { - fdmap[submount.Destination] = submount.fd + fdmap[submount.mount.Destination] = submount.fd } } return context.WithValue(ctx, gofer.CtxRestoreServerFDMap, fdmap), nil diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index 438b7ef3e..335e46499 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -40,23 +40,24 @@ const ( cgroupRoot = "/sys/fs/cgroup" ) -var controllers = map[string]config{ - "blkio": {ctrlr: &blockIO{}}, - "cpu": {ctrlr: &cpu{}}, - "cpuset": {ctrlr: &cpuSet{}}, - "hugetlb": {ctrlr: &hugeTLB{}, optional: true}, - "memory": {ctrlr: &memory{}}, - "net_cls": {ctrlr: &networkClass{}}, - "net_prio": {ctrlr: &networkPrio{}}, - "pids": {ctrlr: &pids{}}, +var controllers = map[string]controller{ + "blkio": &blockIO{}, + "cpu": &cpu{}, + "cpuset": &cpuSet{}, + "hugetlb": &hugeTLB{}, + "memory": &memory{}, + "net_cls": &networkClass{}, + "net_prio": &networkPrio{}, + "pids": &pids{}, // These controllers either don't have anything in the OCI spec or is // irrelevant for a sandbox. - "devices": {ctrlr: &noop{}}, - "freezer": {ctrlr: &noop{}}, - "perf_event": {ctrlr: &noop{}}, - "rdma": {ctrlr: &noop{}, optional: true}, - "systemd": {ctrlr: &noop{}}, + "cpuacct": &noop{}, + "devices": &noop{}, + "freezer": &noop{}, + "perf_event": &noop{}, + "rdma": &noop{isOptional: true}, + "systemd": &noop{}, } // IsOnlyV2 checks whether cgroups V2 is enabled and V1 is not. @@ -201,31 +202,26 @@ func countCpuset(cpuset string) (int, error) { return count, nil } -// LoadPaths loads cgroup paths for given 'pid', may be set to 'self'. -func LoadPaths(pid string) (map[string]string, error) { - f, err := os.Open(filepath.Join("/proc", pid, "cgroup")) +// loadPaths loads cgroup paths for given 'pid', may be set to 'self'. 
+func loadPaths(pid string) (map[string]string, error) { + procCgroup, err := os.Open(filepath.Join("/proc", pid, "cgroup")) if err != nil { return nil, err } - defer f.Close() + defer procCgroup.Close() - return loadPathsHelper(f) -} - -func loadPathsHelper(cgroup io.Reader) (map[string]string, error) { - // For nested containers, in /proc/self/cgroup we see paths from host, - // which don't exist in container, so recover the container paths here by - // double-checking with /proc/pid/mountinfo - mountinfo, err := os.Open("/proc/self/mountinfo") + // Load mountinfo for the current process, because it's where cgroups is + // being accessed from. + mountinfo, err := os.Open(filepath.Join("/proc/self/mountinfo")) if err != nil { return nil, err } defer mountinfo.Close() - return loadPathsHelperWithMountinfo(cgroup, mountinfo) + return loadPathsHelper(procCgroup, mountinfo) } -func loadPathsHelperWithMountinfo(cgroup, mountinfo io.Reader) (map[string]string, error) { +func loadPathsHelper(cgroup, mountinfo io.Reader) (map[string]string, error) { paths := make(map[string]string) scanner := bufio.NewScanner(cgroup) @@ -242,34 +238,51 @@ func loadPathsHelperWithMountinfo(cgroup, mountinfo io.Reader) (map[string]strin for _, ctrlr := range strings.Split(tokens[1], ",") { // Remove prefix for cgroups with no controller, eg. systemd. ctrlr = strings.TrimPrefix(ctrlr, "name=") - paths[ctrlr] = tokens[2] + // Discard unknown controllers. + if _, ok := controllers[ctrlr]; ok { + paths[ctrlr] = tokens[2] + } } } if err := scanner.Err(); err != nil { return nil, err } - mfScanner := bufio.NewScanner(mountinfo) - for mfScanner.Scan() { - txt := mfScanner.Text() - fields := strings.Fields(txt) + // For nested containers, in /proc/[pid]/cgroup we see paths from host, + // which don't exist in container, so recover the container paths here by + // double-checking with /proc/[pid]/mountinfo + mountScanner := bufio.NewScanner(mountinfo) + for mountScanner.Scan() { + // Format: ID parent major:minor root mount-point options opt-fields - fs-type source super-options + // Example: 39 32 0:34 / /sys/fs/cgroup/devices rw,noexec shared:18 - cgroup cgroup rw,devices + fields := strings.Fields(mountScanner.Text()) if len(fields) < 9 || fields[len(fields)-3] != "cgroup" { + // Skip mounts that are not cgroup mounts. continue } - for _, opt := range strings.Split(fields[len(fields)-1], ",") { + // Cgroup controller type is in the super-options field. + superOptions := strings.Split(fields[len(fields)-1], ",") + for _, opt := range superOptions { // Remove prefix for cgroups with no controller, eg. systemd. opt = strings.TrimPrefix(opt, "name=") + + // Only considers cgroup controllers that are registered, and skip other + // irrelevant options, e.g. rw. if cgroupPath, ok := paths[opt]; ok { - root := fields[3] - relCgroupPath, err := filepath.Rel(root, cgroupPath) - if err != nil { - return nil, err + rootDir := fields[3] + if rootDir != "/" { + // When cgroup is in submount, remove repeated path components from + // cgroup path to avoid duplicating them. + relCgroupPath, err := filepath.Rel(rootDir, cgroupPath) + if err != nil { + return nil, err + } + paths[opt] = relCgroupPath } - paths[opt] = relCgroupPath } } } - if err := mfScanner.Err(); err != nil { + if err := mountScanner.Err(); err != nil { return nil, err } @@ -279,37 +292,50 @@ func loadPathsHelperWithMountinfo(cgroup, mountinfo io.Reader) (map[string]strin // Cgroup represents a group inside all controllers. 
For example: // Name='/foo/bar' maps to /sys/fs/cgroup/<controller>/foo/bar on // all controllers. +// +// If Name is relative, it uses the parent cgroup path to determine the +// location. For example: +// Name='foo/bar' and Parent[ctrl]="/user.slice", then it will map to +// /sys/fs/cgroup/<ctrl>/user.slice/foo/bar type Cgroup struct { Name string `json:"name"` Parents map[string]string `json:"parents"` Own map[string]bool `json:"own"` } -// New creates a new Cgroup instance if the spec includes a cgroup path. -// Returns nil otherwise. -func New(spec *specs.Spec) (*Cgroup, error) { +// NewFromSpec creates a new Cgroup instance if the spec includes a cgroup path. +// Returns nil otherwise. Cgroup paths are loaded based on the current process. +func NewFromSpec(spec *specs.Spec) (*Cgroup, error) { if spec.Linux == nil || spec.Linux.CgroupsPath == "" { return nil, nil } - return NewFromPath(spec.Linux.CgroupsPath) + return new("self", spec.Linux.CgroupsPath) } -// NewFromPath creates a new Cgroup instance. -func NewFromPath(cgroupsPath string) (*Cgroup, error) { +// NewFromPid loads cgroup for the given process. +func NewFromPid(pid int) (*Cgroup, error) { + return new(strconv.Itoa(pid), "") +} + +func new(pid, cgroupsPath string) (*Cgroup, error) { var parents map[string]string + + // If path is relative, load cgroup paths for the process to build the + // relative paths. if !filepath.IsAbs(cgroupsPath) { var err error - parents, err = LoadPaths("self") + parents, err = loadPaths(pid) if err != nil { return nil, fmt.Errorf("finding current cgroups: %w", err) } } - own := make(map[string]bool) - return &Cgroup{ + cg := &Cgroup{ Name: cgroupsPath, Parents: parents, - Own: own, - }, nil + Own: make(map[string]bool), + } + log.Debugf("New cgroup for pid: %s, %+v", pid, cg) + return cg, nil } // Install creates and configures cgroups according to 'res'. If cgroup path @@ -323,8 +349,8 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error { clean := cleanup.Make(func() { _ = c.Uninstall() }) defer clean.Clean() - for key, cfg := range controllers { - path := c.makePath(key) + for key, ctrlr := range controllers { + path := c.MakePath(key) if _, err := os.Stat(path); err == nil { // If cgroup has already been created; it has been setup by caller. Don't // make any changes to configuration, just join when sandbox/gofer starts. @@ -336,13 +362,16 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error { c.Own[key] = true if err := os.MkdirAll(path, 0755); err != nil { - if cfg.optional && errors.Is(err, unix.EROFS) { + if ctrlr.optional() && errors.Is(err, unix.EROFS) { + if err := ctrlr.skip(res); err != nil { + return err + } log.Infof("Skipping cgroup %q", key) continue } return err } - if err := cfg.ctrlr.set(res, path); err != nil { + if err := ctrlr.set(res, path); err != nil { return err } } @@ -359,7 +388,7 @@ func (c *Cgroup) Uninstall() error { // cgroup is managed by caller, don't touch it. continue } - path := c.makePath(key) + path := c.MakePath(key) log.Debugf("Removing cgroup controller for key=%q path=%q", key, path) // If we try to remove the cgroup too soon after killing the @@ -387,7 +416,7 @@ func (c *Cgroup) Uninstall() error { func (c *Cgroup) Join() (func(), error) { // First save the current state so it can be restored. undo := func() {} - paths, err := LoadPaths("self") + paths, err := loadPaths("self") if err != nil { return undo, err } @@ -414,14 +443,13 @@ func (c *Cgroup) Join() (func(), error) { } // Now join the cgroups. 
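// Worked example (not part of the patch): how Cgroup.MakePath, used by the
// join loop below, resolves the two cases described in the Cgroup comment
// above; the paths are illustrative.
//
//   abs := Cgroup{Name: "/foo/bar"}
//   abs.MakePath("cpu") // "/sys/fs/cgroup/cpu/foo/bar"
//
//   rel := Cgroup{
//       Name:    "foo/bar",
//       Parents: map[string]string{"cpu": "/user.slice"},
//   }
//   rel.MakePath("cpu") // "/sys/fs/cgroup/cpu/user.slice/foo/bar"
//
// When a controller has no entry in Parents, Name is used as-is relative to
// /sys/fs/cgroup/<controller>.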
- for key, cfg := range controllers { - path := c.makePath(key) + for key, ctrlr := range controllers { + path := c.MakePath(key) log.Debugf("Joining cgroup %q", path) - // Writing the value 0 to a cgroup.procs file causes the - // writing process to be moved to the corresponding cgroup. - // - cgroups(7). + // Writing the value 0 to a cgroup.procs file causes the writing process to + // be moved to the corresponding cgroup - cgroups(7). if err := setValue(path, "cgroup.procs", "0"); err != nil { - if cfg.optional && os.IsNotExist(err) { + if ctrlr.optional() && os.IsNotExist(err) { continue } return undo, err @@ -432,7 +460,7 @@ func (c *Cgroup) Join() (func(), error) { // CPUQuota returns the CFS CPU quota. func (c *Cgroup) CPUQuota() (float64, error) { - path := c.makePath("cpu") + path := c.MakePath("cpu") quota, err := getInt(path, "cpu.cfs_quota_us") if err != nil { return -1, err @@ -449,7 +477,7 @@ func (c *Cgroup) CPUQuota() (float64, error) { // CPUUsage returns the total CPU usage of the cgroup. func (c *Cgroup) CPUUsage() (uint64, error) { - path := c.makePath("cpuacct") + path := c.MakePath("cpuacct") usage, err := getValue(path, "cpuacct.usage") if err != nil { return 0, err @@ -459,7 +487,7 @@ func (c *Cgroup) CPUUsage() (uint64, error) { // NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'. func (c *Cgroup) NumCPU() (int, error) { - path := c.makePath("cpuset") + path := c.MakePath("cpuset") cpuset, err := getValue(path, "cpuset.cpus") if err != nil { return 0, err @@ -469,7 +497,7 @@ func (c *Cgroup) NumCPU() (int, error) { // MemoryLimit returns the memory limit. func (c *Cgroup) MemoryLimit() (uint64, error) { - path := c.makePath("memory") + path := c.MakePath("memory") limStr, err := getValue(path, "memory.limit_in_bytes") if err != nil { return 0, err @@ -477,7 +505,8 @@ func (c *Cgroup) MemoryLimit() (uint64, error) { return strconv.ParseUint(strings.TrimSpace(limStr), 10, 64) } -func (c *Cgroup) makePath(controllerName string) string { +// MakePath builds a path to the given controller. +func (c *Cgroup) MakePath(controllerName string) string { path := c.Name if parent, ok := c.Parents[controllerName]; ok { path = filepath.Join(parent, c.Name) @@ -485,22 +514,48 @@ func (c *Cgroup) makePath(controllerName string) string { return filepath.Join(cgroupRoot, controllerName, path) } -type config struct { - ctrlr controller - optional bool -} - type controller interface { + // optional controllers don't fail if not found. + optional() bool + // set applies resource limits to controller. set(*specs.LinuxResources, string) error + // skip is called when controller is not found to check if it can be safely + // skipped or not based on the spec. 
+ skip(*specs.LinuxResources) error +} + +type noop struct { + isOptional bool } -type noop struct{} +func (n *noop) optional() bool { + return n.isOptional +} func (*noop) set(*specs.LinuxResources, string) error { return nil } -type memory struct{} +func (n *noop) skip(*specs.LinuxResources) error { + if !n.isOptional { + panic("cgroup controller is not optional") + } + return nil +} + +type mandatory struct{} + +func (*mandatory) optional() bool { + return false +} + +func (*mandatory) skip(*specs.LinuxResources) error { + panic("cgroup controller is not optional") +} + +type memory struct { + mandatory +} func (*memory) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.Memory == nil { @@ -533,7 +588,9 @@ func (*memory) set(spec *specs.LinuxResources, path string) error { return nil } -type cpu struct{} +type cpu struct { + mandatory +} func (*cpu) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.CPU == nil { @@ -554,7 +611,9 @@ func (*cpu) set(spec *specs.LinuxResources, path string) error { return setOptionalValueInt(path, "cpu.rt_runtime_us", spec.CPU.RealtimeRuntime) } -type cpuSet struct{} +type cpuSet struct { + mandatory +} func (*cpuSet) set(spec *specs.LinuxResources, path string) error { // cpuset.cpus and mems are required fields, but are not set on a new cgroup. @@ -576,7 +635,9 @@ func (*cpuSet) set(spec *specs.LinuxResources, path string) error { return setValue(path, "cpuset.mems", spec.CPU.Mems) } -type blockIO struct{} +type blockIO struct { + mandatory +} func (*blockIO) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.BlockIO == nil { @@ -628,6 +689,10 @@ func setThrottle(path, name string, devs []specs.LinuxThrottleDevice) error { type networkClass struct{} +func (*networkClass) optional() bool { + return true +} + func (*networkClass) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.Network == nil { return nil @@ -635,8 +700,19 @@ func (*networkClass) set(spec *specs.LinuxResources, path string) error { return setOptionalValueUint32(path, "net_cls.classid", spec.Network.ClassID) } +func (*networkClass) skip(spec *specs.LinuxResources) error { + if spec != nil && spec.Network != nil && spec.Network.ClassID != nil { + return fmt.Errorf("Network.ClassID set but net_cls cgroup controller not found") + } + return nil +} + type networkPrio struct{} +func (*networkPrio) optional() bool { + return true +} + func (*networkPrio) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.Network == nil { return nil @@ -650,7 +726,16 @@ func (*networkPrio) set(spec *specs.LinuxResources, path string) error { return nil } -type pids struct{} +func (*networkPrio) skip(spec *specs.LinuxResources) error { + if spec != nil && spec.Network != nil && len(spec.Network.Priorities) > 0 { + return fmt.Errorf("Network.Priorities set but net_prio cgroup controller not found") + } + return nil +} + +type pids struct { + mandatory +} func (*pids) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.Pids == nil || spec.Pids.Limit <= 0 { @@ -662,6 +747,17 @@ func (*pids) set(spec *specs.LinuxResources, path string) error { type hugeTLB struct{} +func (*hugeTLB) optional() bool { + return true +} + +func (*hugeTLB) skip(spec *specs.LinuxResources) error { + if spec != nil && len(spec.HugepageLimits) > 0 { + return fmt.Errorf("HugepageLimits set but hugetlb cgroup controller not found") + } + return nil +} + func (*hugeTLB) set(spec 
*specs.LinuxResources, path string) error { if spec == nil { return nil diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go index 48d71cfa6..02cadeef4 100644 --- a/runsc/cgroup/cgroup_test.go +++ b/runsc/cgroup/cgroup_test.go @@ -43,19 +43,19 @@ var debianMountinfo = ` ` var dindMountinfo = ` -1305 1304 0:64 / /sys/fs/cgroup rw - tmpfs tmpfs rw,mode=755 -1306 1305 0:32 /docker/136 /sys/fs/cgroup/systemd ro master:11 - cgroup cgroup rw,xattr,name=systemd -1307 1305 0:36 /docker/136 /sys/fs/cgroup/cpu,cpuacct ro master:16 - cgroup cgroup rw,cpu,cpuacct -1308 1305 0:37 /docker/136 /sys/fs/cgroup/freezer ro master:17 - cgroup cgroup rw,freezer -1309 1305 0:38 /docker/136 /sys/fs/cgroup/hugetlb ro master:18 - cgroup cgroup rw,hugetlb -1310 1305 0:39 /docker/136 /sys/fs/cgroup/cpuset ro master:19 - cgroup cgroup rw,cpuset -1311 1305 0:40 /docker/136 /sys/fs/cgroup/net_cls,net_prio ro master:20 - cgroup cgroup rw,net_cls,net_prio -1312 1305 0:41 /docker/136 /sys/fs/cgroup/pids ro master:21 - cgroup cgroup rw,pids -1313 1305 0:42 /docker/136 /sys/fs/cgroup/perf_event ro master:22 - cgroup cgroup rw,perf_event -1314 1305 0:43 /docker/136 /sys/fs/cgroup/memory ro master:23 - cgroup cgroup rw,memory -1316 1305 0:44 /docker/136 /sys/fs/cgroup/blkio ro master:24 - cgroup cgroup rw,blkio -1317 1305 0:45 /docker/136 /sys/fs/cgroup/devices ro master:25 - cgroup cgroup rw,devices -1318 1305 0:46 / /sys/fs/cgroup/rdma ro master:26 - cgroup cgroup rw,rdma +05 04 0:64 / /sys/fs/cgroup rw - tmpfs tmpfs rw,mode=755 +06 05 0:32 /docker/136 /sys/fs/cgroup/systemd ro master:11 - cgroup cgroup rw,xattr,name=systemd +07 05 0:36 /docker/136 /sys/fs/cgroup/cpu,cpuacct ro master:16 - cgroup cgroup rw,cpu,cpuacct +08 05 0:37 /docker/136 /sys/fs/cgroup/freezer ro master:17 - cgroup cgroup rw,freezer +09 05 0:38 /docker/136 /sys/fs/cgroup/hugetlb ro master:18 - cgroup cgroup rw,hugetlb +10 05 0:39 /docker/136 /sys/fs/cgroup/cpuset ro master:19 - cgroup cgroup rw,cpuset +11 05 0:40 /docker/136 /sys/fs/cgroup/net_cls,net_prio ro master:20 - cgroup cgroup rw,net_cls,net_prio +12 05 0:41 /docker/136 /sys/fs/cgroup/pids ro master:21 - cgroup cgroup rw,pids +13 05 0:42 /docker/136 /sys/fs/cgroup/perf_event ro master:22 - cgroup cgroup rw,perf_event +14 05 0:43 /docker/136 /sys/fs/cgroup/memory ro master:23 - cgroup cgroup rw,memory +16 05 0:44 /docker/136 /sys/fs/cgroup/blkio ro master:24 - cgroup cgroup rw,blkio +17 05 0:45 /docker/136 /sys/fs/cgroup/devices ro master:25 - cgroup cgroup rw,devices +18 05 0:46 / /sys/fs/cgroup/rdma ro master:26 - cgroup cgroup rw,rdma ` func TestUninstallEnoent(t *testing.T) { @@ -693,36 +693,42 @@ func TestLoadPaths(t *testing.T) { err string }{ { - name: "abs-path-unknown-controller", - cgroups: "0:ctr:/path", + name: "empty", mountinfo: debianMountinfo, - want: map[string]string{"ctr": "/path"}, + }, + { + name: "abs-path", + cgroups: "0:cpu:/path", + mountinfo: debianMountinfo, + want: map[string]string{"cpu": "/path"}, }, { name: "rel-path", - cgroups: "0:ctr:rel-path", + cgroups: "0:cpu:rel-path", mountinfo: debianMountinfo, - want: map[string]string{"ctr": "rel-path"}, + want: map[string]string{"cpu": "rel-path"}, }, { name: "non-controller", cgroups: "0:name=systemd:/path", mountinfo: debianMountinfo, - want: map[string]string{"systemd": "path"}, + want: map[string]string{"systemd": "/path"}, }, { - name: "empty", + name: "unknown-controller", + cgroups: "0:ctr:/path", mountinfo: debianMountinfo, + want: map[string]string{}, }, { name: "multiple", - cgroups: 
"0:ctr0:/path0\n" + - "1:ctr1:/path1\n" + + cgroups: "0:cpu:/path0\n" + + "1:memory:/path1\n" + "2::/empty\n", mountinfo: debianMountinfo, want: map[string]string{ - "ctr0": "/path0", - "ctr1": "/path1", + "cpu": "/path0", + "memory": "/path1", }, }, { @@ -747,10 +753,10 @@ func TestLoadPaths(t *testing.T) { }, { name: "nested-cgroup", - cgroups: `9:memory:/docker/136 -2:cpu,cpuacct:/docker/136 -1:name=systemd:/docker/136 -0::/system.slice/containerd.service`, + cgroups: "9:memory:/docker/136\n" + + "2:cpu,cpuacct:/docker/136\n" + + "1:name=systemd:/docker/136\n" + + "0::/system.slice/containerd.service\n", mountinfo: dindMountinfo, // we want relative path to /sys/fs/cgroup inside the nested container. // Subcroup inside the container will be created at /sys/fs/cgroup/cpu @@ -781,15 +787,15 @@ func TestLoadPaths(t *testing.T) { }, { name: "invalid-rel-path-in-proc-cgroup", - cgroups: "9:memory:./invalid", + cgroups: "9:memory:invalid", mountinfo: dindMountinfo, - err: "can't make ./invalid relative to /docker/136", + err: "can't make invalid relative to /docker/136", }, } { t.Run(tc.name, func(t *testing.T) { r := strings.NewReader(tc.cgroups) mountinfo := strings.NewReader(tc.mountinfo) - got, err := loadPathsHelperWithMountinfo(r, mountinfo) + got, err := loadPathsHelper(r, mountinfo) if len(tc.err) == 0 { if err != nil { t.Fatalf("Unexpected error: %v", err) @@ -813,3 +819,47 @@ func TestLoadPaths(t *testing.T) { }) } } + +func TestOptional(t *testing.T) { + for _, tc := range []struct { + name string + ctrlr controller + spec *specs.LinuxResources + err string + }{ + { + name: "net-cls", + ctrlr: &networkClass{}, + spec: &specs.LinuxResources{Network: &specs.LinuxNetwork{ClassID: uint32Ptr(1)}}, + err: "Network.ClassID set but net_cls cgroup controller not found", + }, + { + name: "net-prio", + ctrlr: &networkPrio{}, + spec: &specs.LinuxResources{Network: &specs.LinuxNetwork{ + Priorities: []specs.LinuxInterfacePriority{ + {Name: "foo", Priority: 1}, + }, + }}, + err: "Network.Priorities set but net_prio cgroup controller not found", + }, + { + name: "hugetlb", + ctrlr: &hugeTLB{}, + spec: &specs.LinuxResources{HugepageLimits: []specs.LinuxHugepageLimit{ + {Pagesize: "1", Limit: 2}, + }}, + err: "HugepageLimits set but hugetlb cgroup controller not found", + }, + } { + t.Run(tc.name, func(t *testing.T) { + err := tc.ctrlr.skip(tc.spec) + if err == nil { + t.Fatalf("ctrlr.skip() didn't fail") + } + if !strings.Contains(err.Error(), tc.err) { + t.Errorf("ctrlr.skip() want: *%s*, got: %q", tc.err, err) + } + }) + } +} diff --git a/runsc/cli/BUILD b/runsc/cli/BUILD index f1e3cce68..360e3cea6 100644 --- a/runsc/cli/BUILD +++ b/runsc/cli/BUILD @@ -10,8 +10,10 @@ go_library( "//runsc:__pkg__", ], deps = [ + "//pkg/coverage", "//pkg/log", "//pkg/refs", + "//pkg/refsvfs2", "//pkg/sentry/platform", "//runsc/cmd", "//runsc/config", diff --git a/runsc/cli/main.go b/runsc/cli/main.go index a3c515f4b..76184cd9c 100644 --- a/runsc/cli/main.go +++ b/runsc/cli/main.go @@ -27,8 +27,10 @@ import ( "github.com/google/subcommands" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/coverage" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/runsc/cmd" "gvisor.dev/gvisor/runsc/config" @@ -50,6 +52,7 @@ var ( logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.") debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. 
If set, the 'debug-log-dir' flag is ignored.") panicLogFD = flag.Int("panic-log-fd", -1, "file descriptor to write Go's runtime messages.") + coverageFD = flag.Int("coverage-fd", -1, "file descriptor to write Go coverage output.") ) // Main is the main entrypoint. @@ -86,6 +89,7 @@ func Main(version string) { subcommands.Register(new(cmd.Symbolize), "") subcommands.Register(new(cmd.Wait), "") subcommands.Register(new(cmd.Mitigate), "") + subcommands.Register(new(cmd.VerityPrepare), "") // Register internal commands with the internal group name. This causes // them to be sorted below the user-facing commands with empty group. @@ -204,6 +208,10 @@ func Main(version string) { } else if conf.AlsoLogToStderr { e = &log.MultiEmitter{e, newEmitter(conf.DebugLogFormat, os.Stderr)} } + if *coverageFD >= 0 { + f := os.NewFile(uintptr(*coverageFD), "coverage file") + coverage.EnableReport(f) + } log.SetTarget(e) @@ -233,6 +241,9 @@ func Main(version string) { // Call the subcommand and pass in the configuration. var ws unix.WaitStatus subcmdCode := subcommands.Execute(context.Background(), conf, &ws) + // Check for leaks and write coverage report before os.Exit(). + refsvfs2.DoLeakCheck() + coverage.Report() if subcmdCode == subcommands.ExitSuccess { log.Infof("Exiting with status: %v", ws) if ws.Signaled() { diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 2c3b4058b..39c8ff603 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -23,6 +23,7 @@ go_library( "kill.go", "list.go", "mitigate.go", + "mitigate_extras.go", "path.go", "pause.go", "ps.go", @@ -35,6 +36,7 @@ go_library( "statefile.go", "symbolize.go", "syscalls.go", + "verity_prepare.go", "wait.go", ], visibility = [ diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go index 455c57692..5485db149 100644 --- a/runsc/cmd/do.go +++ b/runsc/cmd/do.go @@ -126,9 +126,8 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su Hostname: hostname, } - specutils.LogSpec(spec) - cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000)) + if conf.Network == config.NetworkNone { addNamespace(spec, specs.LinuxNamespace{Type: specs.NetworkNamespace}) @@ -154,55 +153,7 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su } } - out, err := json.Marshal(spec) - if err != nil { - return Errorf("Error to marshal spec: %v", err) - } - tmpDir, err := ioutil.TempDir("", "runsc-do") - if err != nil { - return Errorf("Error to create tmp dir: %v", err) - } - defer os.RemoveAll(tmpDir) - - log.Infof("Changing configuration RootDir to %q", tmpDir) - conf.RootDir = tmpDir - - cfgPath := filepath.Join(tmpDir, "config.json") - if err := ioutil.WriteFile(cfgPath, out, 0755); err != nil { - return Errorf("Error write spec: %v", err) - } - - containerArgs := container.Args{ - ID: cid, - Spec: spec, - BundleDir: tmpDir, - Attached: true, - } - ct, err := container.New(conf, containerArgs) - if err != nil { - return Errorf("creating container: %v", err) - } - defer ct.Destroy() - - if err := ct.Start(conf); err != nil { - return Errorf("starting container: %v", err) - } - - // Forward signals to init in the container. Thus if we get SIGINT from - // ^C, the container gracefully exit, and we can clean up. - // - // N.B. There is a still a window before this where a signal may kill - // this process, skipping cleanup. 
- stopForwarding := ct.ForwardSignals(0 /* pid */, false /* fgProcess */)
- defer stopForwarding()
-
- ws, err := ct.Wait()
- if err != nil {
- return Errorf("waiting for container: %v", err)
- }
-
- *waitStatus = ws
- return subcommands.ExitSuccess
+ return startContainerAndWait(spec, conf, cid, waitStatus)
}

func addNamespace(spec *specs.Spec, ns specs.LinuxNamespace) {
@@ -397,3 +348,58 @@ func calculatePeerIP(ip string) (string, error) {
 }
 return fmt.Sprintf("%s.%s.%s.%d", parts[0], parts[1], parts[2], n), nil
}
+
+func startContainerAndWait(spec *specs.Spec, conf *config.Config, cid string, waitStatus *unix.WaitStatus) subcommands.ExitStatus {
+ specutils.LogSpec(spec)
+
+ out, err := json.Marshal(spec)
+ if err != nil {
+ return Errorf("Error marshaling spec: %v", err)
+ }
+ tmpDir, err := ioutil.TempDir("", "runsc-do")
+ if err != nil {
+ return Errorf("Error creating tmp dir: %v", err)
+ }
+ defer os.RemoveAll(tmpDir)
+
+ log.Infof("Changing configuration RootDir to %q", tmpDir)
+ conf.RootDir = tmpDir
+
+ cfgPath := filepath.Join(tmpDir, "config.json")
+ if err := ioutil.WriteFile(cfgPath, out, 0755); err != nil {
+ return Errorf("Error writing spec: %v", err)
+ }
+
+ containerArgs := container.Args{
+ ID: cid,
+ Spec: spec,
+ BundleDir: tmpDir,
+ Attached: true,
+ }
+
+ ct, err := container.New(conf, containerArgs)
+ if err != nil {
+ return Errorf("creating container: %v", err)
+ }
+ defer ct.Destroy()
+
+ if err := ct.Start(conf); err != nil {
+ return Errorf("starting container: %v", err)
+ }
+
+ // Forward signals to init in the container. Thus if we get SIGINT from
+ // ^C, the container gracefully exits, and we can clean up.
+ //
+ // N.B. There is still a window before this where a signal may kill
+ // this process, skipping cleanup.
+ stopForwarding := ct.ForwardSignals(0 /* pid */, false /* fgProcess */)
+ defer stopForwarding()
+
+ ws, err := ct.Wait()
+ if err != nil {
+ return Errorf("waiting for container: %v", err)
+ }
+
+ *waitStatus = ws
+ return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 4cb0164dd..6a755ecb6 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -176,7 +176,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 mountIdx := 1 // first one is the root
 for _, m := range spec.Mounts {
- if specutils.Is9PMount(m) {
+ if specutils.Is9PMount(m, conf.VFS2) {
 cfg := fsgofer.Config{
 ROMount: isReadonlyMount(m.Options) || conf.Overlay,
 HostUDS: conf.FSGoferHostUDS,
@@ -350,7 +350,7 @@ func setupRootFS(spec *specs.Spec, conf *config.Config) error {
 // creates directories as needed.
 func setupMounts(conf *config.Config, mounts []specs.Mount, root string) error {
 for _, m := range mounts {
- if m.Type != "bind" || !specutils.IsVFS1SupportedDevMount(m) {
+ if !specutils.Is9PMount(m, conf.VFS2) {
 continue
 }
@@ -390,7 +390,7 @@ func setupMounts(conf *config.Config, mounts []specs.Mount, root string) error {
 func resolveMounts(conf *config.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
 cleanMounts := make([]specs.Mount, 0, len(mounts))
 for _, m := range mounts {
- if m.Type != "bind" || !specutils.IsVFS1SupportedDevMount(m) {
+ if !specutils.Is9PMount(m, conf.VFS2) {
 cleanMounts = append(cleanMounts, m)
 continue
 }
diff --git a/runsc/cmd/mitigate.go b/runsc/cmd/mitigate.go
index 720141aa5..f4e65adb8 100644
--- a/runsc/cmd/mitigate.go
+++ b/runsc/cmd/mitigate.go
@@ -41,8 +41,8 @@ type Mitigate struct {
 reverse bool
 // Path to file to read to create CPUSet. 
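 // Per the usage text this defaults to /proc/cpuinfo; when --reverse is set,
 // Execute swaps it for allPossibleCPUs (/sys/devices/system/cpu/possible).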
path string - // Callback to check if a given thread is vulnerable. - vulnerable func(other mitigate.Thread) bool + // Extra data for post mitigate operations. + data string } // Name implements subcommands.command.name. @@ -55,19 +55,20 @@ func (*Mitigate) Synopsis() string { return "mitigate mitigates the underlying system against side channel attacks" } -// Usage implments Usage for cmd.Mitigate. +// Usage implements Usage for cmd.Mitigate. func (m Mitigate) Usage() string { - return `mitigate [flags] + return fmt.Sprintf(`mitigate [flags] mitigate mitigates a system to the "MDS" vulnerability by implementing a manual shutdown of SMT. The command checks /proc/cpuinfo for cpus having the MDS vulnerability, and if found, shutdown all but one CPU per hyperthread pair via /sys/devices/system/cpu/cpu{N}/online. CPUs can be restored by writing "2" to each file in /sys/devices/system/cpu/cpu{N}/online or performing a system reboot. -The command can be reversed with --reverse, which reads the total CPUs from /sys/devices/system/cpu/possible and enables all with /sys/devices/system/cpu/cpu{N}/online.` +The command can be reversed with --reverse, which reads the total CPUs from /sys/devices/system/cpu/possible and enables all with /sys/devices/system/cpu/cpu{N}/online.%s`, m.usage()) } // SetFlags sets flags for the command Mitigate. func (m *Mitigate) SetFlags(f *flag.FlagSet) { f.BoolVar(&m.dryRun, "dryrun", false, "run the command without changing system") f.BoolVar(&m.reverse, "reverse", false, "reverse mitigate by enabling all CPUs") + m.setFlags(f) } // Execute implements subcommands.Command.Execute. @@ -87,13 +88,17 @@ func (m *Mitigate) Execute(_ context.Context, f *flag.FlagSet, args ...interface m.path = allPossibleCPUs } - m.vulnerable = func(other mitigate.Thread) bool { - return other.IsVulnerable() + set, err := m.doExecute() + if err != nil { + return Errorf("Execute failed: %v", err) } - if _, err := m.doExecute(); err != nil { - log.Warningf("Execute failed: %v", err) - return subcommands.ExitFailure + if m.data == "" { + return subcommands.ExitSuccess + } + + if err = m.postMitigate(set); err != nil { + return Errorf("Post Mitigate failed: %v", err) } return subcommands.ExitSuccess @@ -104,32 +109,26 @@ func (m *Mitigate) doExecute() (mitigate.CPUSet, error) { if m.dryRun { log.Infof("Running with DryRun. 
No cpu settings will be changed.") } + data, err := ioutil.ReadFile(m.path) + if err != nil { + return nil, fmt.Errorf("failed to read %s: %w", m.path, err) + } if m.reverse { - data, err := ioutil.ReadFile(m.path) - if err != nil { - return nil, fmt.Errorf("failed to read %s: %v", m.path, err) - } - set, err := m.doReverse(data) if err != nil { - return nil, fmt.Errorf("reverse operation failed: %v", err) + return nil, fmt.Errorf("reverse operation failed: %w", err) } return set, nil } - - data, err := ioutil.ReadFile(m.path) - if err != nil { - return nil, fmt.Errorf("failed to read %s: %v", m.path, err) - } set, err := m.doMitigate(data) if err != nil { - return nil, fmt.Errorf("mitigate operation failed: %v", err) + return nil, fmt.Errorf("mitigate operation failed: %w", err) } return set, nil } func (m *Mitigate) doMitigate(data []byte) (mitigate.CPUSet, error) { - set, err := mitigate.NewCPUSet(data, m.vulnerable) + set, err := mitigate.NewCPUSet(data) if err != nil { return nil, err } @@ -145,7 +144,7 @@ func (m *Mitigate) doMitigate(data []byte) (mitigate.CPUSet, error) { continue } if err := t.Disable(); err != nil { - return nil, fmt.Errorf("error disabling thread: %s err: %v", t, err) + return nil, fmt.Errorf("error disabling thread: %s err: %w", t, err) } } log.Infof("Shutdown successful.") @@ -170,7 +169,7 @@ func (m *Mitigate) doReverse(data []byte) (mitigate.CPUSet, error) { continue } if err := t.Enable(); err != nil { - return nil, fmt.Errorf("error enabling thread: %s err: %v", t, err) + return nil, fmt.Errorf("error enabling thread: %s err: %w", t, err) } } log.Infof("Enable successful.") diff --git a/runsc/cmd/mitigate_extras.go b/runsc/cmd/mitigate_extras.go new file mode 100644 index 000000000..2cb2833f0 --- /dev/null +++ b/runsc/cmd/mitigate_extras.go @@ -0,0 +1,33 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "gvisor.dev/gvisor/runsc/flag" + "gvisor.dev/gvisor/runsc/mitigate" +) + +// usage returns any extra bits of the usage string. +func (m *Mitigate) usage() string { + return "" +} + +// setFlags sets extra flags for the command Mitigate. +func (m *Mitigate) setFlags(f *flag.FlagSet) {} + +// postMitigate handles any postMitigate actions. 
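+// In this build the hook is a no-op and m.data stays empty, so Execute
+// returns right after the mitigate step without reaching postMitigate.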
+func (m *Mitigate) postMitigate(_ mitigate.CPUSet) error {
+ return nil
+}
diff --git a/runsc/cmd/mitigate_test.go b/runsc/cmd/mitigate_test.go
index 54211ce32..2d3fef7c1 100644
--- a/runsc/cmd/mitigate_test.go
+++ b/runsc/cmd/mitigate_test.go
@@ -23,7 +23,6 @@ import (
 "strings"
 "testing"
- "gvisor.dev/gvisor/runsc/mitigate"
 "gvisor.dev/gvisor/runsc/mitigate/mock"
)
@@ -86,9 +85,6 @@ power management::84
 t.Run(tc.name, func(t *testing.T) {
 m := &Mitigate{
 dryRun: true,
- vulnerable: func(other mitigate.Thread) bool {
- return other.IsVulnerable()
- },
 }
 m.doExecuteTest(t, "Mitigate", tc.mitigateData, tc.mitigateCPU, tc.mitigateError)
@@ -106,9 +102,6 @@ func TestExecuteSmoke(t *testing.T) {
 m := &Mitigate{
 dryRun: true,
- vulnerable: func(other mitigate.Thread) bool {
- return other.IsVulnerable()
- },
 }
 m.doExecuteTest(t, "Mitigate", string(smokeMitigate), 0, nil)
diff --git a/runsc/cmd/symbolize.go b/runsc/cmd/symbolize.go
index fc0c69358..0fa4bfda1 100644
--- a/runsc/cmd/symbolize.go
+++ b/runsc/cmd/symbolize.go
@@ -65,13 +65,15 @@ func (c *Symbolize) Execute(_ context.Context, f *flag.FlagSet, args ...interfac
 f.Usage()
 return subcommands.ExitUsageError
 }
- if !coverage.KcovAvailable() {
+ if !coverage.Available() {
 return Errorf("symbolize can only be used when coverage is available.")
 }
 coverage.InitCoverageData()
 if c.dumpAll {
- coverage.WriteAllBlocks(os.Stdout)
+ if err := coverage.WriteAllBlocks(os.Stdout); err != nil {
+ return Errorf("Failed to write out blocks: %v", err)
+ }
 return subcommands.ExitSuccess
 }
diff --git a/runsc/cmd/verity_prepare.go b/runsc/cmd/verity_prepare.go
new file mode 100644
index 000000000..66128b2a3
--- /dev/null
+++ b/runsc/cmd/verity_prepare.go
@@ -0,0 +1,108 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "fmt"
+ "math/rand"
+ "os"
+
+ "github.com/google/subcommands"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/runsc/config"
+ "gvisor.dev/gvisor/runsc/flag"
+ "gvisor.dev/gvisor/runsc/specutils"
+)
+
+// VerityPrepare implements subcommands.Command for the "verity-prepare"
+// command. It sets up a sandbox with a writable verity mount mapped to "--dir",
+// and executes the verity measure tool specified by "--tool" in the sandbox. It
+// is intended to prepare --dir to be mounted as a verity filesystem.
+type VerityPrepare struct {
+ root string
+ tool string
+ dir string
+}
+
+// Name implements subcommands.Command.Name.
+func (*VerityPrepare) Name() string {
+ return "verity-prepare"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*VerityPrepare) Synopsis() string {
+ return "Generates the data structures necessary to enable verityfs on a filesystem."
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*VerityPrepare) Usage() string {
+ return "verity-prepare --tool=<measure_tool> --dir=<path>"
+}
+
+// SetFlags implements subcommands.Command.SetFlags. 
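+//
+// A hypothetical invocation (both paths are examples only):
+//
+//   runsc verity-prepare --tool=/usr/bin/measure --dir=/var/lib/data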
+func (c *VerityPrepare) SetFlags(f *flag.FlagSet) {
+ f.StringVar(&c.root, "root", "/", `path to the root directory, defaults to "/"`)
+ f.StringVar(&c.tool, "tool", "", "path to the verity measure_tool")
+ f.StringVar(&c.dir, "dir", "", "path to the directory to be hashed")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (c *VerityPrepare) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ conf := args[0].(*config.Config)
+ waitStatus := args[1].(*unix.WaitStatus)
+
+ hostname, err := os.Hostname()
+ if err != nil {
+ return Errorf("Error retrieving hostname: %v", err)
+ }
+
+ // Map the entire host file system.
+ absRoot, err := resolvePath(c.root)
+ if err != nil {
+ return Errorf("Error resolving root: %v", err)
+ }
+
+ spec := &specs.Spec{
+ Root: &specs.Root{
+ Path: absRoot,
+ },
+ Process: &specs.Process{
+ Cwd: absRoot,
+ Args: []string{c.tool, "--path", "/verityroot"},
+ Env: os.Environ(),
+ Capabilities: specutils.AllCapabilities(),
+ },
+ Hostname: hostname,
+ Mounts: []specs.Mount{
+ specs.Mount{
+ Source: c.dir,
+ Destination: "/verityroot",
+ Type: "bind",
+ Options: []string{"verity.roothash="},
+ },
+ },
+ }
+
+ cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000))
+
+ // Force no networking; it is not necessary to run the verity measure tool.
+ conf.Network = config.NetworkNone
+
+ conf.Verity = true
+
+ return startContainerAndWait(spec, conf, cid, waitStatus)
+}
diff --git a/runsc/config/config.go b/runsc/config/config.go
index 1e5858837..fa550ebf7 100644
--- a/runsc/config/config.go
+++ b/runsc/config/config.go
@@ -55,6 +55,9 @@ type Config struct {
 // PanicLog is the path to log GO's runtime messages, if not empty.
 PanicLog string `flag:"panic-log"`
+ // CoverageReport is the path to write Go coverage information, if not empty.
+ CoverageReport string `flag:"coverage-report"`
+
 // DebugLogFormat is the log format for debug.
 DebugLogFormat string `flag:"debug-log-format"`
@@ -172,6 +175,9 @@ type Config struct {
 // Enables seccomp inside the sandbox.
 OCISeccomp bool `flag:"oci-seccomp"`
+ // Mounts the cgroup filesystem backed by the sentry's cgroupfs.
+ Cgroupfs bool `flag:"cgroupfs"`
+
 // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
 // tests. It allows runsc to start the sandbox process as the current
 // user, and without chrooting the sandbox process. This can be
diff --git a/runsc/config/flags.go b/runsc/config/flags.go
index 1d996c841..c3dca2352 100644
--- a/runsc/config/flags.go
+++ b/runsc/config/flags.go
@@ -44,7 +44,8 @@ func RegisterFlags() {
 // Debugging flags.
 flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
- flag.String("panic-log", "", "file path were panic reports and other Go's runtime messages are written.")
+ flag.String("panic-log", "", "file path where panic reports and other Go's runtime messages are written.")
+ flag.String("coverage-report", "", "file path where Go coverage reports are written. 
Reports will only be generated if runsc is built with --collect_code_coverage and --instrumentation_filter Bazel flags.") flag.Bool("log-packets", false, "enable network packet logging.") flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.") flag.Bool("alsologtostderr", false, "send log messages to stderr.") @@ -75,6 +76,7 @@ func RegisterFlags() { flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.") flag.Bool("vfs2", false, "enables VFSv2. This uses the new VFS layer that is faster than the previous one.") flag.Bool("fuse", false, "TEST ONLY; use while FUSE in VFSv2 is landing. This allows the use of the new experimental FUSE filesystem.") + flag.Bool("cgroupfs", false, "Automatically mount cgroupfs.") // Flags that control sandbox runtime behavior: network related. flag.Var(networkTypePtr(NetworkSandbox), "network", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 3620dc8c3..5314549d6 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -51,9 +51,7 @@ go_test( ], library = ":container", shard_count = more_shards, - tags = [ - "requires-kvm", - ], + tags = ["requires-kvm"], deps = [ "//pkg/abi/linux", "//pkg/bits", diff --git a/runsc/container/container.go b/runsc/container/container.go index f9d83c118..0820edaec 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -233,7 +233,7 @@ func New(conf *config.Config, args Args) (*Container, error) { } // Create and join cgroup before processes are created to ensure they are // part of the cgroup from the start (and all their children processes). - cg, err := cgroup.New(args.Spec) + cg, err := cgroup.NewFromSpec(args.Spec) if err != nil { return nil, err } @@ -886,7 +886,7 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bu // Add root mount and then add any other additional mounts. mountCount := 1 for _, m := range spec.Mounts { - if specutils.Is9PMount(m) { + if specutils.Is9PMount(m, conf.VFS2) { mountCount++ } } @@ -1132,7 +1132,7 @@ func (c *Container) populateStats(event *boot.EventOut) { // account for the full cgroup CPU usage. We split cgroup usage // proportionally according to the sentry-internal usage measurements, // only counting Running containers. - log.Warningf("event.ContainerUsage: %v", event.ContainerUsage) + log.Debugf("event.ContainerUsage: %v", event.ContainerUsage) var containerUsage uint64 var allContainersUsage uint64 for ID, usage := range event.ContainerUsage { @@ -1142,7 +1142,7 @@ func (c *Container) populateStats(event *boot.EventOut) { } } - cgroup, err := c.Sandbox.FindCgroup() + cgroup, err := c.Sandbox.NewCGroup() if err != nil { // No cgroup, so rely purely on the sentry's accounting. log.Warningf("events: no cgroups") @@ -1159,17 +1159,18 @@ func (c *Container) populateStats(event *boot.EventOut) { return } - // If the sentry reports no memory usage, fall back on cgroups and - // split usage equally across containers. + // If the sentry reports no CPU usage, fall back on cgroups and split usage + // equally across containers. 
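+ // For example (hypothetical numbers): with a cgroupsUsage of 30s and
+ // three containers in event.ContainerUsage, each is attributed 10s.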
if allContainersUsage == 0 {
 log.Warningf("events: no sentry CPU usage reported")
 allContainersUsage = cgroupsUsage
 containerUsage = cgroupsUsage / uint64(len(event.ContainerUsage))
 }
- log.Warningf("%f, %f, %f", containerUsage, cgroupsUsage, allContainersUsage)
 // Scaling can easily overflow a uint64 (e.g. a containerUsage and
 // cgroupsUsage of 16 seconds each will overflow), so use floats.
- event.Event.Data.CPU.Usage.Total = uint64(float64(containerUsage) * (float64(cgroupsUsage) / float64(allContainersUsage)))
+ total := float64(containerUsage) * (float64(cgroupsUsage) / float64(allContainersUsage))
+ log.Debugf("Usage, container: %d, cgroups: %d, all: %d, total: %.0f", containerUsage, cgroupsUsage, allContainersUsage, total)
+ event.Event.Data.CPU.Usage.Total = uint64(total)
 return
}
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 5a0c468a4..0e79877b7 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -2449,6 +2449,27 @@ func TestCreateWithCorruptedStateFile(t *testing.T) {
 }
}
+func TestBindMountByOption(t *testing.T) {
+ for name, conf := range configs(t, all...) {
+ t.Run(name, func(t *testing.T) {
+ dir, err := ioutil.TempDir(testutil.TmpDir(), "bind-mount")
+ if err != nil {
+ t.Fatalf("ioutil.TempDir(): %v", err)
+ }
+ spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
+ spec.Mounts = append(spec.Mounts, specs.Mount{
+ Destination: dir,
+ Source: dir,
+ Type: "none",
+ Options: []string{"rw", "bind"},
+ })
+ if err := run(spec, conf); err != nil {
+ t.Fatalf("error running sandbox: %v", err)
+ }
+ })
+ }
+}
+
 func execute(cont *Container, name string, arg ...string) (unix.WaitStatus, error) {
 args := &control.ExecArgs{
 Filename: name,
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 0f0a223ce..0dbe1e323 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -25,6 +25,7 @@ import (
 "testing"
 "time"
+ "github.com/cenkalti/backoff"
 specs "github.com/opencontainers/runtime-spec/specs-go"
 "golang.org/x/sys/unix"
 "gvisor.dev/gvisor/pkg/cleanup"
@@ -1510,7 +1511,7 @@ func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
 Destination: "/mydir/test",
 Source: "/some/dir",
 Type: "tmpfs",
- Options: []string{"rw", "rbind", "relatime"},
+ Options: []string{"rw", "relatime"},
 }
 podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
@@ -1917,9 +1918,9 @@ func TestMultiContainerEvent(t *testing.T) {
 }
 defer cleanup()
- for _, cont := range containers {
- t.Logf("Running containerd %s", cont.ID)
- }
+ t.Logf("Running container sleep %s", containers[0].ID)
+ t.Logf("Running container busy %s", containers[1].ID)
+ t.Logf("Running container quick %s", containers[2].ID)
 // Wait for last container to stabilize the process count that is
 // checked further below.
@@ -1940,50 +1941,61 @@ func TestMultiContainerEvent(t *testing.T) {
 }
 // Check events for running containers. 
- var prevUsage uint64 for _, cont := range containers[:2] { ret, err := cont.Event() if err != nil { - t.Errorf("Container.Events(): %v", err) + t.Errorf("Container.Event(%q): %v", cont.ID, err) } evt := ret.Event if want := "stats"; evt.Type != want { - t.Errorf("Wrong event type, want: %s, got: %s", want, evt.Type) + t.Errorf("Wrong event type, cid: %q, want: %s, got: %s", cont.ID, want, evt.Type) } if cont.ID != evt.ID { t.Errorf("Wrong container ID, want: %s, got: %s", cont.ID, evt.ID) } // One process per remaining container. if got, want := evt.Data.Pids.Current, uint64(2); got != want { - t.Errorf("Wrong number of PIDs, want: %d, got: %d", want, got) + t.Errorf("Wrong number of PIDs, cid: %q, want: %d, got: %d", cont.ID, want, got) } - // Both remaining containers should have nonzero usage, and - // 'busy' should have higher usage than 'sleep'. - usage := evt.Data.CPU.Usage.Total - if usage == 0 { - t.Errorf("Running container should report nonzero CPU usage, but got %d", usage) + // The exited container should always have a usage of zero. + if exited := ret.ContainerUsage[containers[2].ID]; exited != 0 { + t.Errorf("Exited container should report 0 CPU usage, got: %d", exited) + } + } + + // Check that CPU reported by busy container is higher than sleep. + cb := func() error { + sleepEvt, err := containers[0].Event() + if err != nil { + return &backoff.PermanentError{Err: err} } - if usage <= prevUsage { - t.Errorf("Expected container %s to use more than %d ns of CPU, but used %d", cont.ID, prevUsage, usage) + sleepUsage := sleepEvt.Event.Data.CPU.Usage.Total + + busyEvt, err := containers[1].Event() + if err != nil { + return &backoff.PermanentError{Err: err} } - t.Logf("Container %s usage: %d", cont.ID, usage) - prevUsage = usage + busyUsage := busyEvt.Event.Data.CPU.Usage.Total - // The exited container should have a usage of zero. - if exited := ret.ContainerUsage[containers[2].ID]; exited != 0 { - t.Errorf("Exited container should report 0 CPU usage, but got %d", exited) + if busyUsage <= sleepUsage { + t.Logf("Busy container usage lower than sleep (busy: %d, sleep: %d), retrying...", busyUsage, sleepUsage) + return fmt.Errorf("Busy container should have higher usage than sleep, busy: %d, sleep: %d", busyUsage, sleepUsage) } + return nil + } + // Give time for busy container to run and use more CPU than sleep. + if err := testutil.Poll(cb, 10*time.Second); err != nil { + t.Fatal(err) } - // Check that stop and destroyed containers return error. + // Check that stopped and destroyed containers return error. if err := containers[1].Destroy(); err != nil { t.Fatalf("container.Destroy: %v", err) } for _, cont := range containers[1:] { - _, err := cont.Event() - if err == nil { - t.Errorf("Container.Events() should have failed, cid:%s, state: %v", cont.ID, cont.Status) + if _, err := cont.Event(); err == nil { + t.Errorf("Container.Event() should have failed, cid: %q, state: %v", cont.ID, cont.Status) } } } diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index e04ddda47..b81ede5ae 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -21,6 +21,7 @@ package fsgofer import ( + "errors" "fmt" "io" "math" @@ -58,9 +59,6 @@ var verityXattrs = map[string]struct{}{ // join is equivalent to path.Join() but skips path.Clean() which is expensive. func join(parent, child string) string { - if child == "." || child == ".." 
{
- panic(fmt.Sprintf("invalid child path %q", child))
- }
 return parent + "/" + child
}
@@ -1226,3 +1224,56 @@ func (l *localFile) checkROMount() error {
 }
 return nil
}
+
+func (l *localFile) MultiGetAttr(names []string) ([]p9.FullStat, error) {
+ stats := make([]p9.FullStat, 0, len(names))
+
+ if len(names) > 0 && names[0] == "" {
+ qid, valid, attr, err := l.GetAttr(p9.AttrMask{})
+ if err != nil {
+ return nil, err
+ }
+ stats = append(stats, p9.FullStat{
+ QID: qid,
+ Valid: valid,
+ Attr: attr,
+ })
+ names = names[1:]
+ }
+
+ parent := l.file.FD()
+ for _, name := range names {
+ child, err := unix.Openat(parent, name, openFlags|unix.O_PATH, 0)
+ if parent != l.file.FD() {
+ // Parent is no longer needed.
+ _ = unix.Close(parent)
+ }
+ if err != nil {
+ if errors.Is(err, unix.ENOENT) {
+ // No point in continuing any further.
+ break
+ }
+ return nil, err
+ }
+
+ var stat unix.Stat_t
+ if err := unix.Fstat(child, &stat); err != nil {
+ _ = unix.Close(child)
+ return nil, err
+ }
+ valid, attr := l.fillAttr(&stat)
+ stats = append(stats, p9.FullStat{
+ QID: l.attachPoint.makeQID(&stat),
+ Valid: valid,
+ Attr: attr,
+ })
+ if (stat.Mode & unix.S_IFMT) != unix.S_IFDIR {
+ // No need to continue if the entry is not a directory, including
+ // symlinks that cannot be followed.
+ _ = unix.Close(child)
+ break
+ }
+ parent = child
+ }
+ return stats, nil
+}
diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go
index d7e141476..77723827a 100644
--- a/runsc/fsgofer/fsgofer_test.go
+++ b/runsc/fsgofer/fsgofer_test.go
@@ -703,16 +703,6 @@ func TestWalkNotFound(t *testing.T) {
 })
}
-func TestWalkPanic(t *testing.T) {
- runCustom(t, []uint32{unix.S_IFDIR}, allConfs, func(t *testing.T, s state) {
- for _, name := range []string{".", ".."} {
- assertPanic(t, func() {
- s.file.Walk([]string{name})
- })
- }
- })
-}
-
func TestWalkDup(t *testing.T) {
 runAll(t, func(t *testing.T, s state) {
 _, dup, err := s.file.Walk([]string{})
diff --git a/runsc/mitigate/mitigate.go b/runsc/mitigate/mitigate.go
index 24f67414c..88409af8f 100644
--- a/runsc/mitigate/mitigate.go
+++ b/runsc/mitigate/mitigate.go
@@ -50,7 +50,7 @@ const (
type CPUSet map[threadID]*ThreadGroup
// NewCPUSet creates a CPUSet from data read from /proc/cpuinfo. 
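// Threads that share a physical core are grouped into a single ThreadGroup,
// and a group is marked vulnerable as soon as any one of its threads is
// (see the isVulnerable update below).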
-func NewCPUSet(data []byte, vulnerable func(Thread) bool) (CPUSet, error) {
+func NewCPUSet(data []byte) (CPUSet, error) {
 processors, err := getThreads(string(data))
 if err != nil {
 return nil, err
@@ -67,7 +67,7 @@ func NewCPUSet(data []byte, vulnerable func(Thread) bool) (CPUSet, error) {
 core = &ThreadGroup{}
 set[p.id] = core
 }
- core.isVulnerable = core.isVulnerable || vulnerable(p)
+ core.isVulnerable = core.isVulnerable || p.IsVulnerable()
 core.threads = append(core.threads, p)
 }
@@ -446,6 +446,7 @@ func buildRegex(key, match string) *regexp.Regexp {
func parseRegex(data, key, match string) (string, error) {
 r := buildRegex(key, match)
 matches := r.FindStringSubmatch(data)
+
 if len(matches) < 2 {
 return "", fmt.Errorf("failed to match key %q: %q", key, data)
 }
diff --git a/runsc/mitigate/mitigate_test.go b/runsc/mitigate/mitigate_test.go
index bd5a2433f..890c65f05 100644
--- a/runsc/mitigate/mitigate_test.go
+++ b/runsc/mitigate/mitigate_test.go
@@ -54,14 +54,13 @@ func TestMockCPUSet(t *testing.T) {
 } {
 t.Run(tc.testCase.Name, func(t *testing.T) {
 data := tc.testCase.MakeCPUString()
- vulnerable := func(t Thread) bool {
- return t.IsVulnerable()
- }
- set, err := NewCPUSet([]byte(data), vulnerable)
+ set, err := NewCPUSet([]byte(data))
 if err != nil {
 t.Fatalf("Failed to create cpuSet: %v", err)
 }
+ t.Logf("data: %s", data)
+
 for _, tg := range set {
 if err := checkSorted(tg.threads); err != nil {
 t.Fatalf("Failed to sort cpuSet: %v", err)
@@ -260,11 +259,7 @@ func TestReadFile(t *testing.T) {
 t.Fatalf("Failed to read cpuinfo: %v", err)
 }
- vulnerable := func(t Thread) bool {
- return t.IsVulnerable()
- }
-
- set, err := NewCPUSet(data, vulnerable)
+ set, err := NewCPUSet(data)
 if err != nil {
 t.Fatalf("Failed to parse CPU data %v\n%s", err, data)
 }
diff --git a/runsc/mitigate/mock/mock.go b/runsc/mitigate/mock/mock.go
index 2db718cb9..12c59e356 100644
--- a/runsc/mitigate/mock/mock.go
+++ b/runsc/mitigate/mock/mock.go
@@ -82,6 +82,19 @@ var Haswell2core = CPU{
 ThreadsPerCore: 1,
}
+// AMD2 is a two core AMD machine.
+var AMD2 = CPU{
+ Name: "AMD",
+ VendorID: "AuthenticAMD",
+ Family: 23,
+ Model: 49,
+ ModelName: "AMD EPYC 7B12",
+ Bugs: "sysret_ss_attrs spectre_v1 spectre_v2 spec_store_bypass",
+ PhysicalCores: 1,
+ Cores: 1,
+ ThreadsPerCore: 2,
+}
+
// AMD8 is an eight core AMD machine. 
var AMD8 = CPU{ Name: "AMD", @@ -115,15 +128,15 @@ bugs : %s for k := 0; k < tc.ThreadsPerCore; k++ { processorNum := (i*tc.Cores+j)*tc.ThreadsPerCore + k ret += fmt.Sprintf(template, - processorNum, /*processor*/ - tc.VendorID, /*vendor_id*/ - tc.Family, /*cpu family*/ - tc.Model, /*model*/ - tc.ModelName, /*model name*/ - i, /*physical id*/ - j, /*core id*/ - tc.Cores*tc.PhysicalCores, /*cpu cores*/ - tc.Bugs, /*bugs*/ + processorNum, /*processor*/ + tc.VendorID, /*vendor_id*/ + tc.Family, /*cpu family*/ + tc.Model, /*model*/ + tc.ModelName, /*model name*/ + i, /*physical id*/ + j, /*core id*/ + k, /*cpu cores*/ + tc.Bugs, /*bugs*/ ) } } diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index f0a551a1e..bc4a3fa32 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -16,6 +16,7 @@ go_library( "//pkg/cleanup", "//pkg/control/client", "//pkg/control/server", + "//pkg/coverage", "//pkg/log", "//pkg/sentry/control", "//pkg/sentry/platform", diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 450f92645..8d31e33b2 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -34,6 +34,7 @@ import ( "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/control/client" "gvisor.dev/gvisor/pkg/control/server" + "gvisor.dev/gvisor/pkg/coverage" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/platform" @@ -309,20 +310,9 @@ func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { return pl, nil } -// FindCgroup returns the sandbox's Cgroup, or an error if it does not have one. -func (s *Sandbox) FindCgroup() (*cgroup.Cgroup, error) { - paths, err := cgroup.LoadPaths(strconv.Itoa(s.Pid)) - if err != nil { - return nil, err - } - // runsc places sandboxes in the same cgroup for each controller, so we - // pick an arbitrary controller here to get the cgroup path. - const controller = "cpuacct" - controllerPath, ok := paths[controller] - if !ok { - return nil, fmt.Errorf("no %q controller found", controller) - } - return cgroup.NewFromPath(controllerPath) +// NewCGroup returns the sandbox's Cgroup, or an error if it does not have one. +func (s *Sandbox) NewCGroup() (*cgroup.Cgroup, error) { + return cgroup.NewFromPid(s.Pid) } // Execute runs the specified command in the container. It returns the PID of @@ -399,15 +389,15 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn cmd.Args = append(cmd.Args, "--log-fd="+strconv.Itoa(nextFD)) nextFD++ } - if conf.DebugLog != "" { - test := "" - if len(conf.TestOnlyTestNameEnv) != 0 { - // Fetch test name if one is provided and the test only flag was set. - if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok { - test = t - } - } + test := "" + if len(conf.TestOnlyTestNameEnv) != 0 { + // Fetch test name if one is provided and the test only flag was set. + if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok { + test = t + } + } + if conf.DebugLog != "" { debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "boot", test) if err != nil { return fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err) @@ -418,23 +408,29 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn nextFD++ } if conf.PanicLog != "" { - test := "" - if len(conf.TestOnlyTestNameEnv) != 0 { - // Fetch test name if one is provided and the test only flag was set. 
- if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
- test = t
- }
- }
-
 panicLogFile, err := specutils.DebugLogFile(conf.PanicLog, "panic", test)
 if err != nil {
- return fmt.Errorf("opening debug log file in %q: %v", conf.PanicLog, err)
+ return fmt.Errorf("opening panic log file in %q: %v", conf.PanicLog, err)
 }
 defer panicLogFile.Close()
 cmd.ExtraFiles = append(cmd.ExtraFiles, panicLogFile)
 cmd.Args = append(cmd.Args, "--panic-log-fd="+strconv.Itoa(nextFD))
 nextFD++
 }
+ covFilename := conf.CoverageReport
+ if covFilename == "" {
+ covFilename = os.Getenv("GO_COVERAGE_FILE")
+ }
+ if covFilename != "" && coverage.Available() {
+ covFile, err := specutils.DebugLogFile(covFilename, "cov", test)
+ if err != nil {
+ return fmt.Errorf("opening coverage report file in %q: %v", covFilename, err)
+ }
+ defer covFile.Close()
+ cmd.ExtraFiles = append(cmd.ExtraFiles, covFile)
+ cmd.Args = append(cmd.Args, "--coverage-fd="+strconv.Itoa(nextFD))
+ nextFD++
+ }
 // Add the "boot" command to the args.
 //
@@ -486,7 +482,7 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn
 }
 if deviceFile, err := gPlatform.OpenDevice(); err != nil {
- return fmt.Errorf("opening device file for platform %q: %v", gPlatform, err)
+ return fmt.Errorf("opening device file for platform %q: %v", conf.Platform, err)
 } else if deviceFile != nil {
 defer deviceFile.Close()
 cmd.ExtraFiles = append(cmd.ExtraFiles, deviceFile)
@@ -1174,7 +1170,7 @@ func deviceFileForPlatform(name string) (*os.File, error) {
 f, err := p.OpenDevice()
 if err != nil {
- return nil, fmt.Errorf("opening device file for platform %q: %v", p, err)
+ return nil, fmt.Errorf("opening device file for platform %q: %w", name, err)
 }
 return f, nil
}
diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go
index b62504a8c..9ecd0fde6 100644
--- a/runsc/specutils/fs.go
+++ b/runsc/specutils/fs.go
@@ -18,6 +18,7 @@ import (
 "fmt"
 "math/bits"
 "path"
+ "strings"
 specs "github.com/opencontainers/runtime-spec/specs-go"
 "golang.org/x/sys/unix"
@@ -64,6 +65,12 @@ var optionsMap = map[string]mapping{
 "sync": {set: true, val: unix.MS_SYNCHRONOUS},
}
+// verityMountOptions is the set of valid verity mount option keys.
+var verityMountOptions = map[string]struct{}{
+ "verity.roothash": struct{}{},
+ "verity.action": struct{}{},
+}
+
// propOptionsMap is similar to optionsMap, but it lists propagation options
// that cannot be used together with other flags.
var propOptionsMap = map[string]mapping{
@@ -117,6 +124,14 @@ func validateMount(mnt *specs.Mount) error {
 return nil
}
+func moptKey(opt string) string {
+ if len(opt) == 0 {
+ return opt
+ }
+ // Guaranteed to have at least one token, since opt is not empty.
+ return strings.SplitN(opt, "=", 2)[0]
+}
+
// ValidateMountOptions validates that mount options are correct. 
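// Key=value options such as "verity.roothash=<hash>" are reduced to their
// key ("verity.roothash") by moptKey before the verityMountOptions lookup
// below.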
func ValidateMountOptions(opts []string) error { for _, o := range opts { @@ -125,7 +140,8 @@ func ValidateMountOptions(opts []string) error { } _, ok1 := optionsMap[o] _, ok2 := propOptionsMap[o] - if !ok1 && !ok2 { + _, ok3 := verityMountOptions[moptKey(o)] + if !ok1 && !ok2 && !ok3 { return fmt.Errorf("unknown mount option %q", o) } if err := validatePropagation(o); err != nil { diff --git a/runsc/specutils/seccomp/BUILD b/runsc/specutils/seccomp/BUILD index e9e647d82..c5f5b863e 100644 --- a/runsc/specutils/seccomp/BUILD +++ b/runsc/specutils/seccomp/BUILD @@ -28,8 +28,10 @@ go_test( srcs = ["seccomp_test.go"], library = ":seccomp", deps = [ - "//pkg/binary", + "//pkg/abi/linux", "//pkg/bpf", + "//pkg/hostarch", + "//pkg/marshal", "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/runsc/specutils/seccomp/seccomp_test.go b/runsc/specutils/seccomp/seccomp_test.go index 11a6c8daa..20796bf14 100644 --- a/runsc/specutils/seccomp/seccomp_test.go +++ b/runsc/specutils/seccomp/seccomp_test.go @@ -20,20 +20,15 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/marshal" ) -type seccompData struct { - nr uint32 - arch uint32 - instructionPointer uint64 - args [6]uint64 -} - -// asInput converts a seccompData to a bpf.Input. -func asInput(d seccompData) bpf.Input { - return bpf.InputBytes{binary.Marshal(nil, binary.LittleEndian, d), binary.LittleEndian} +// asInput converts a linux.SeccompData to a bpf.Input. +func asInput(d *linux.SeccompData) bpf.Input { + return bpf.InputBytes{marshal.Marshal(d), hostarch.ByteOrder} } // testInput creates an Input struct with given seccomp input values. @@ -49,13 +44,13 @@ func testInput(arch uint32, syscallName string, args *[6]uint64) bpf.Input { args = &argArray } - data := seccompData{ - nr: syscallNo, - arch: arch, - args: *args, + data := linux.SeccompData{ + Nr: int32(syscallNo), + Arch: arch, + Args: *args, } - return asInput(data) + return asInput(&data) } // testCase holds a seccomp test case. @@ -100,7 +95,7 @@ var ( }, // Syscall matches but the arch is AUDIT_ARCH_X86 so the return // value is the bad arch action. - input: asInput(seccompData{nr: 183, arch: 0x40000003}), // + input: asInput(&linux.SeccompData{Nr: 183, Arch: 0x40000003}), // expected: uint32(killThreadAction), }, { diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 45856fd58..11b476690 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -332,14 +332,38 @@ func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth. return auth.CapabilitySetOfMany(caps), nil } -// Is9PMount returns true if the given mount can be mounted as an external gofer. -func Is9PMount(m specs.Mount) bool { - return m.Type == "bind" && m.Source != "" && IsVFS1SupportedDevMount(m) +// Is9PMount returns true if the given mount can be mounted as an external +// gofer. +func Is9PMount(m specs.Mount, vfs2Enabled bool) bool { + MaybeConvertToBindMount(&m) + return m.Type == "bind" && m.Source != "" && IsSupportedDevMount(m, vfs2Enabled) } -// IsVFS1SupportedDevMount returns true if m.Destination does not specify a +// MaybeConvertToBindMount converts mount type to "bind" in case any of the +// mount options are either "bind" or "rbind" as required by the OCI spec. 
+//
+// "For bind mounts (when options include either bind or rbind), the type is a
+// dummy, often "none" (not listed in /proc/filesystems)."
+func MaybeConvertToBindMount(m *specs.Mount) {
+ if m.Type == "bind" {
+ return
+ }
+ for _, opt := range m.Options {
+ if opt == "bind" || opt == "rbind" {
+ m.Type = "bind"
+ return
+ }
+ }
+}
+
+// IsSupportedDevMount returns true if m.Destination does not specify a
 // path that is hardcoded by VFS1's implementation of /dev.
-func IsVFS1SupportedDevMount(m specs.Mount) bool {
+func IsSupportedDevMount(m specs.Mount, vfs2Enabled bool) bool {
+ // VFS2 has no hardcoded files under /dev, so everything is allowed.
+ if vfs2Enabled {
+ return true
+ }
+
 // See pkg/sentry/fs/dev/dev.go.
 var existingDevices = []string{
 "/dev/fd", "/dev/stdin", "/dev/stdout", "/dev/stderr",