Diffstat (limited to 'runsc')
37 files changed, 1020 insertions, 433 deletions
diff --git a/runsc/BUILD b/runsc/BUILD index 3b91b984a..7a7dcc8d5 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -9,6 +9,7 @@ go_binary( "version.go", ], pure = True, + tags = ["staging"], visibility = [ "//visibility:public", ], @@ -49,5 +50,4 @@ sh_test( srcs = ["version_test.sh"], args = ["$(location :runsc)"], data = [":runsc"], - tags = ["noguitar"], ) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 67307ab3c..d51347fe1 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -30,6 +30,7 @@ go_library( "//pkg/cleanup", "//pkg/context", "//pkg/control/server", + "//pkg/coverage", "//pkg/cpuid", "//pkg/eventchannel", "//pkg/fd", @@ -37,6 +38,7 @@ go_library( "//pkg/fspath", "//pkg/log", "//pkg/memutil", + "//pkg/metric", "//pkg/rand", "//pkg/refs", "//pkg/refsvfs2", @@ -57,6 +59,7 @@ go_library( "//pkg/sentry/fs/tmpfs", "//pkg/sentry/fs/tty", "//pkg/sentry/fs/user", + "//pkg/sentry/fsimpl/cgroupfs", "//pkg/sentry/fsimpl/devpts", "//pkg/sentry/fsimpl/devtmpfs", "//pkg/sentry/fsimpl/fuse", @@ -66,6 +69,7 @@ go_library( "//pkg/sentry/fsimpl/proc", "//pkg/sentry/fsimpl/sys", "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/fsimpl/verity", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel:uncaught_signal_go_proto", diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 1ae76d7d7..9b270cbf2 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -400,9 +400,9 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { // Set up the restore environment. ctx := k.SupervisorContext() - mntr := newContainerMounter(cm.l.root.spec, cm.l.root.goferFDs, cm.l.k, cm.l.mountHints, kernel.VFS2Enabled) + mntr := newContainerMounter(&cm.l.root, cm.l.k, cm.l.mountHints, kernel.VFS2Enabled) if kernel.VFS2Enabled { - ctx, err = mntr.configureRestore(ctx, cm.l.root.conf) + ctx, err = mntr.configureRestore(ctx) if err != nil { return fmt.Errorf("configuring filesystem restore: %v", err) } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 32adde643..bf4a41f77 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -31,6 +31,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/gofer" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/fs/user" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/cgroupfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" gofervfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer" @@ -103,17 +104,22 @@ func addOverlay(ctx context.Context, conf *config.Config, lower *fs.Inode, name // compileMounts returns the supported mounts from the mount spec, adding any // mandatory mounts that are required by the OCI specification. -func compileMounts(spec *specs.Spec, vfs2Enabled bool) []specs.Mount { +func compileMounts(spec *specs.Spec, conf *config.Config, vfs2Enabled bool) []specs.Mount { // Keep track of whether proc and sys were mounted. var procMounted, sysMounted, devMounted, devptsMounted bool var mounts []specs.Mount // Mount all submounts from the spec. for _, m := range spec.Mounts { - if !vfs2Enabled && !specutils.IsVFS1SupportedDevMount(m) { + if !specutils.IsSupportedDevMount(m, vfs2Enabled) { log.Warningf("ignoring dev mount at %q", m.Destination) continue } + // Unconditionally drop any cgroupfs mounts. If requested, we'll add our + // own below. 
+ if m.Type == cgroupfs.Name { + continue + } switch filepath.Clean(m.Destination) { case "/proc": procMounted = true @@ -132,6 +138,24 @@ func compileMounts(spec *specs.Spec, vfs2Enabled bool) []specs.Mount { // Mount proc and sys even if the user did not ask for it, as the spec // says we SHOULD. var mandatoryMounts []specs.Mount + + if conf.Cgroupfs { + mandatoryMounts = append(mandatoryMounts, specs.Mount{ + Type: tmpfsvfs2.Name, + Destination: "/sys/fs/cgroup", + }) + mandatoryMounts = append(mandatoryMounts, specs.Mount{ + Type: cgroupfs.Name, + Destination: "/sys/fs/cgroup/memory", + Options: []string{"memory"}, + }) + mandatoryMounts = append(mandatoryMounts, specs.Mount{ + Type: cgroupfs.Name, + Destination: "/sys/fs/cgroup/cpu", + Options: []string{"cpu"}, + }) + } + if !procMounted { mandatoryMounts = append(mandatoryMounts, specs.Mount{ Type: procvfs2.Name, @@ -208,7 +232,7 @@ func parseMountOption(opt string, allowedKeys ...string) (bool, error) { // mountDevice returns a device string based on the fs type and target // of the mount. -func mountDevice(m specs.Mount) string { +func mountDevice(m *specs.Mount) string { if m.Type == bind { // Make a device string that includes the target, which is consistent across // S/R and uniquely identifies the connection. @@ -232,6 +256,8 @@ func mountFlags(opts []string) fs.MountSourceFlags { mf.NoAtime = true case "noexec": mf.NoExec = true + case "bind", "rbind": + // These are the same as a mount with type="bind". default: log.Warningf("ignoring unknown mount option %q", o) } @@ -248,6 +274,10 @@ func isSupportedMountFlag(fstype, opt string) bool { ok, err := parseMountOption(opt, tmpfsAllowedData...) return ok && err == nil } + if fstype == cgroupfs.Name { + ok, err := parseMountOption(opt, cgroupfs.SupportedMountOptions...) + return ok && err == nil + } return false } @@ -458,9 +488,9 @@ func (m *mountHint) isSupported() bool { // For now enforce that all options are the same. Once bind mount is properly // supported, then we should ensure the master is less restrictive than the // container, e.g. master can be 'rw' while container mounts as 'ro'. -func (m *mountHint) checkCompatible(mount specs.Mount) error { +func (m *mountHint) checkCompatible(mount *specs.Mount) error { // Remove options that don't affect to mount's behavior. 
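// Illustrative sketch (not part of the patch): the allowlist-style filtering that filterUnsupportedOptions and isSupportedMountFlag perform before the comparison below, reduced to a self-contained program. The names here are made up.
package main

import (
	"fmt"
	"strings"
)

// filterAllowed keeps only options whose key (the part before "=") appears in
// the allowlist, so flags that don't affect behavior can't make two otherwise
// identical mounts look different.
func filterAllowed(opts, allowed []string) []string {
	allowedSet := make(map[string]bool, len(allowed))
	for _, a := range allowed {
		allowedSet[a] = true
	}
	var rv []string
	for _, o := range opts {
		if allowedSet[strings.SplitN(o, "=", 2)[0]] {
			rv = append(rv, o)
		}
	}
	return rv
}

func main() {
	// With an allowlist of {"mode", "size"}, "nosuid" is dropped.
	fmt.Println(filterAllowed([]string{"mode=01777", "nosuid", "size=8m"}, []string{"mode", "size"}))
	// Output: [mode=01777 size=8m]
}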
- masterOpts := filterUnsupportedOptions(m.mount) + masterOpts := filterUnsupportedOptions(&m.mount) replicaOpts := filterUnsupportedOptions(mount) if len(masterOpts) != len(replicaOpts) { @@ -484,7 +514,7 @@ func (m *mountHint) fileAccessType() config.FileAccessType { return config.FileAccessShared } -func filterUnsupportedOptions(mount specs.Mount) []string { +func filterUnsupportedOptions(mount *specs.Mount) []string { rv := make([]string, 0, len(mount.Options)) for _, o := range mount.Options { if isSupportedMountFlag(mount.Type, o) { @@ -548,7 +578,7 @@ func newPodMountHints(spec *specs.Spec) (*podMountHints, error) { return &podMountHints{mounts: mnts}, nil } -func (p *podMountHints) findMount(mount specs.Mount) *mountHint { +func (p *podMountHints) findMount(mount *specs.Mount) *mountHint { for _, m := range p.mounts { if m.mount.Source == mount.Source { return m @@ -572,11 +602,11 @@ type containerMounter struct { hints *podMountHints } -func newContainerMounter(spec *specs.Spec, goferFDs []*fd.FD, k *kernel.Kernel, hints *podMountHints, vfs2Enabled bool) *containerMounter { +func newContainerMounter(info *containerInfo, k *kernel.Kernel, hints *podMountHints, vfs2Enabled bool) *containerMounter { return &containerMounter{ - root: spec.Root, - mounts: compileMounts(spec, vfs2Enabled), - fds: fdDispenser{fds: goferFDs}, + root: info.spec.Root, + mounts: compileMounts(info.spec, info.conf, vfs2Enabled), + fds: fdDispenser{fds: info.goferFDs}, k: k, hints: hints, } @@ -651,7 +681,8 @@ func (c *containerMounter) mountSubmounts(ctx context.Context, conf *config.Conf root := mns.Root() defer root.DecRef(ctx) - for _, m := range c.mounts { + for i := range c.mounts { + m := &c.mounts[i] log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options) if hint := c.hints.findMount(m); hint != nil && hint.isSupported() { if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil { @@ -686,7 +717,7 @@ func (c *containerMounter) checkDispenser() error { func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *config.Config, hint *mountHint) (*fs.Inode, error) { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. - fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount) + fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, &hint.mount) if err != nil { return nil, err } @@ -706,7 +737,7 @@ func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *config.C mf.ReadOnly = true } - inode, err := filesystem.Mount(ctx, mountDevice(hint.mount), mf, strings.Join(opts, ","), nil) + inode, err := filesystem.Mount(ctx, mountDevice(&hint.mount), mf, strings.Join(opts, ","), nil) if err != nil { return nil, fmt.Errorf("creating mount %q: %v", hint.name, err) } @@ -768,13 +799,14 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Con // getMountNameAndOptions retrieves the fsName, opts, and useOverlay values // used for mounts. 
-func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m specs.Mount) (string, []string, bool, error) { +func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m *specs.Mount) (string, []string, bool, error) { + specutils.MaybeConvertToBindMount(m) + var ( fsName string opts []string useOverlay bool ) - switch m.Type { case devpts.Name, devtmpfs.Name, procvfs2.Name, sysvfs2.Name: fsName = m.Type @@ -795,14 +827,20 @@ func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m specs.M opts = p9MountData(fd, c.getMountAccessType(conf, m), conf.VFS2) // If configured, add overlay to all writable mounts. useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly - + case cgroupfs.Name: + fsName = m.Type + var err error + opts, err = parseAndFilterOptions(m.Options, cgroupfs.SupportedMountOptions...) + if err != nil { + return "", nil, false, err + } default: log.Warningf("ignoring unknown filesystem type %q", m.Type) } return fsName, opts, useOverlay, nil } -func (c *containerMounter) getMountAccessType(conf *config.Config, mount specs.Mount) config.FileAccessType { +func (c *containerMounter) getMountAccessType(conf *config.Config, mount *specs.Mount) config.FileAccessType { if hint := c.hints.findMount(mount); hint != nil { return hint.fileAccessType() } @@ -813,7 +851,7 @@ func (c *containerMounter) getMountAccessType(conf *config.Config, mount specs.M // be readonly, a lower ramfs overlay is added to create the mount point dir. // Another overlay is added with tmpfs on top if Config.Overlay is true. // 'm.Destination' must be an absolute path with '..' and symlinks resolved. -func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error { +func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent, m *specs.Mount) error { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m) @@ -887,7 +925,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Confi // mountSharedSubmount binds mount to a previously mounted volume that is shared // among containers in the same pod. -func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount specs.Mount, source *mountHint) error { +func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount *specs.Mount, source *mountHint) error { if err := source.checkCompatible(mount); err != nil { return err } @@ -912,7 +950,7 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun // addRestoreMount adds a mount to the MountSources map used for restoring a // checkpointed container. -func (c *containerMounter) addRestoreMount(conf *config.Config, renv *fs.RestoreEnvironment, m specs.Mount) error { +func (c *containerMounter) addRestoreMount(conf *config.Config, renv *fs.RestoreEnvironment, m *specs.Mount) error { fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m) if err != nil { return err @@ -960,7 +998,8 @@ func (c *containerMounter) createRestoreEnvironment(conf *config.Config) (*fs.Re // Add submounts. 
var tmpMounted bool - for _, m := range c.mounts { + for i := range c.mounts { + m := &c.mounts[i] if err := c.addRestoreMount(conf, renv, m); err != nil { return nil, err } @@ -975,7 +1014,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *config.Config) (*fs.Re Type: tmpfsvfs2.Name, Destination: "/tmp", } - if err := c.addRestoreMount(conf, renv, tmpMount); err != nil { + if err := c.addRestoreMount(conf, renv, &tmpMount); err != nil { return nil, err } } @@ -1034,7 +1073,7 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *config.Config, mn // another user. This is normally done for /tmp. Options: []string{"mode=01777"}, } - return c.mountSubmount(ctx, conf, mns, root, tmpMount) + return c.mountSubmount(ctx, conf, mns, root, &tmpMount) default: return err diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go index b4f12d034..09ffda628 100644 --- a/runsc/boot/fs_test.go +++ b/runsc/boot/fs_test.go @@ -244,7 +244,7 @@ func TestGetMountAccessType(t *testing.T) { } mounter := containerMounter{hints: podHints} conf := &config.Config{FileAccessMounts: config.FileAccessShared} - if got := mounter.getMountAccessType(conf, specs.Mount{Source: source}); got != tst.want { + if got := mounter.getMountAccessType(conf, &specs.Mount{Source: source}); got != tst.want { t.Errorf("getMountAccessType(), want: %v, got: %v", tst.want, got) } }) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 774621970..10f2d3d35 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -29,10 +29,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/coverage" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/memutil" + "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/refsvfs2" @@ -216,6 +218,8 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("setting up memory usage: %v", err) } + metric.CreateSentryMetrics() + // Is this a VFSv2 kernel? if args.Conf.VFS2 { kernel.VFS2Enabled = true @@ -226,6 +230,33 @@ func New(args Args) (*Loader, error) { vfs2.Override() } + // Make host FDs stable between invocations. Host FDs must map to the exact + // same number when the sandbox is restored. Otherwise the wrong FD will be + // used. + info := containerInfo{} + newfd := startingStdioFD + + for _, stdioFD := range args.StdioFDs { + // Check that newfd is unused to avoid clobbering over it. + if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) { + if err != nil { + return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err) + } + return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd) + } + + err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC) + if err != nil { + return nil, fmt.Errorf("dup3 of stdios failed: %w", err) + } + info.stdioFDs = append(info.stdioFDs, fd.New(newfd)) + _ = unix.Close(stdioFD) + newfd++ + } + for _, goferFD := range args.GoferFDs { + info.goferFDs = append(info.goferFDs, fd.New(goferFD)) + } + // Create kernel and platform. 
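// Illustrative sketch (not part of the patch): the dup3-based remapping added above ("Make host FDs stable between invocations"), isolated into a self-contained program. remapStable and the FD values are placeholders.
package main

import (
	"errors"
	"fmt"

	"golang.org/x/sys/unix"
)

// remapStable duplicates each FD onto consecutive numbers starting at start,
// so the same host resources land on the same FD numbers across save/restore.
// It refuses to clobber a target number that is already in use.
func remapStable(fds []int, start int) ([]int, error) {
	out := make([]int, 0, len(fds))
	newfd := start
	for _, f := range fds {
		// EBADF from F_GETFD means newfd is free; anything else means it is
		// occupied (or the probe itself failed).
		if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
			if err != nil {
				return nil, fmt.Errorf("checking FD %d: %w", newfd, err)
			}
			return nil, fmt.Errorf("FD %d is already in use", newfd)
		}
		if err := unix.Dup3(f, newfd, unix.O_CLOEXEC); err != nil {
			return nil, fmt.Errorf("dup3(%d, %d): %w", f, newfd, err)
		}
		_ = unix.Close(f) // the original number is no longer needed
		out = append(out, newfd)
		newfd++
	}
	return out, nil
}

func main() {
	// Open three placeholder descriptors and remap them to 64, 65, 66.
	var fds []int
	for i := 0; i < 3; i++ {
		fd, err := unix.Open("/dev/null", unix.O_RDONLY, 0)
		if err != nil {
			panic(err)
		}
		fds = append(fds, fd)
	}
	fmt.Println(remapStable(fds, 64))
}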
p, err := createPlatform(args.Conf, args.Device) if err != nil { @@ -345,6 +376,7 @@ func New(args Args) (*Loader, error) { if err != nil { return nil, fmt.Errorf("creating init process for root container: %v", err) } + info.procArgs = procArgs if err := initCompatLogs(args.UserLogFD); err != nil { return nil, fmt.Errorf("initializing compat logs: %v", err) @@ -355,6 +387,9 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("creating pod mount hints: %v", err) } + info.conf = args.Conf + info.spec = args.Spec + if kernel.VFS2Enabled { // Set up host mount that will be used for imported fds. hostFilesystem, err := hostvfs2.NewFilesystem(k.VFS()) @@ -369,37 +404,6 @@ func New(args Args) (*Loader, error) { k.SetHostMount(hostMount) } - info := containerInfo{ - conf: args.Conf, - spec: args.Spec, - procArgs: procArgs, - } - - // Make host FDs stable between invocations. Host FDs must map to the exact - // same number when the sandbox is restored. Otherwise the wrong FD will be - // used. - newfd := startingStdioFD - for _, stdioFD := range args.StdioFDs { - // Check that newfd is unused to avoid clobbering over it. - if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) { - if err != nil { - return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err) - } - return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd) - } - - err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC) - if err != nil { - return nil, fmt.Errorf("dup3 of stdios failed: %w", err) - } - info.stdioFDs = append(info.stdioFDs, fd.New(newfd)) - _ = unix.Close(stdioFD) - newfd++ - } - for _, goferFD := range args.GoferFDs { - info.goferFDs = append(info.goferFDs, fd.New(goferFD)) - } - eid := execID{cid: args.ID} l := &Loader{ k: k, @@ -491,10 +495,6 @@ func (l *Loader) Destroy() { // save/restore. l.k.Release() - // All sentry-created resources should have been released at this point; - // check for reference leaks. - refsvfs2.DoLeakCheck() - // In the success case, stdioFDs and goferFDs will only contain // released/closed FDs that ownership has been passed over to host FDs and // gofer sessions. Close them here in case of failure. @@ -752,7 +752,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn // Setup the child container file system. l.startGoferMonitor(cid, info.goferFDs) - mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints, kernel.VFS2Enabled) + mntr := newContainerMounter(info, l.k, l.mountHints, kernel.VFS2Enabled) if root { if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil { return nil, nil, nil, err @@ -1000,6 +1000,15 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // consider the container exited. ws := l.wait(tg) *waitStatus = ws + + // Check for leaks and write coverage report after the root container has + // exited. This guarantees that the report is written in cases where the + // sandbox is killed by a signal after the ContainerWait request is completed. + if l.root.procArgs.ContainerID == cid { + // All sentry-created resources should have been released at this point. 
+ refsvfs2.DoLeakCheck() + coverage.Report() + } return nil } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 8b39bc59a..93c476971 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -439,7 +439,13 @@ func TestCreateMountNamespace(t *testing.T) { } defer cleanup() - mntr := newContainerMounter(&tc.spec, []*fd.FD{fd.New(sandEnd)}, nil, &podMountHints{}, false /* vfs2Enabled */) + info := containerInfo{ + conf: conf, + spec: &tc.spec, + goferFDs: []*fd.FD{fd.New(sandEnd)}, + } + + mntr := newContainerMounter(&info, nil, &podMountHints{}, false /* vfs2Enabled */) mns, err := mntr.createMountNamespace(ctx, conf) if err != nil { t.Fatalf("failed to create mount namespace: %v", err) @@ -479,7 +485,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { defer l.Destroy() defer loaderCleanup() - mntr := newContainerMounter(l.root.spec, l.root.goferFDs, l.k, l.mountHints, true /* vfs2Enabled */) + mntr := newContainerMounter(&l.root, l.k, l.mountHints, true /* vfs2Enabled */) if err := mntr.processHints(l.root.conf, l.root.procArgs.Credentials); err != nil { t.Fatalf("failed process hints: %v", err) } @@ -702,7 +708,12 @@ func TestRestoreEnvironment(t *testing.T) { for _, ioFD := range tc.ioFDs { ioFDs = append(ioFDs, fd.New(ioFD)) } - mntr := newContainerMounter(tc.spec, ioFDs, nil, &podMountHints{}, false /* vfs2Enabled */) + info := containerInfo{ + conf: conf, + spec: tc.spec, + goferFDs: ioFDs, + } + mntr := newContainerMounter(&info, nil, &podMountHints{}, false /* vfs2Enabled */) actualRenv, err := mntr.createRestoreEnvironment(conf) if !tc.errorExpected && err != nil { t.Fatalf("could not create restore environment for test:%s", tc.name) diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 9b3dacf46..c1828bd3d 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -16,6 +16,7 @@ package boot import ( "fmt" + "path" "sort" "strings" @@ -29,6 +30,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/devices/ttydev" "gvisor.dev/gvisor/pkg/sentry/devices/tundev" "gvisor.dev/gvisor/pkg/sentry/fs/user" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/cgroupfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/fuse" @@ -37,12 +39,14 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/verity" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/runsc/config" + "gvisor.dev/gvisor/runsc/specutils" ) func registerFilesystems(k *kernel.Kernel) error { @@ -50,6 +54,10 @@ func registerFilesystems(k *kernel.Kernel) error { creds := auth.NewRootCredentials(k.RootUserNamespace()) vfsObj := k.VFS() + vfsObj.MustRegisterFilesystemType(cgroupfs.Name, &cgroupfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserList: true, // TODO(b/29356795): Users may mount this once the terminals are in a @@ -60,6 +68,10 @@ func registerFilesystems(k *kernel.Kernel) error { AllowUserMount: true, AllowUserList: true, }) + vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + 
}) vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserList: true, }) @@ -79,9 +91,9 @@ func registerFilesystems(k *kernel.Kernel) error { AllowUserMount: true, AllowUserList: true, }) - vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ - AllowUserMount: true, + vfsObj.MustRegisterFilesystemType(verity.Name, &verity.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserList: true, + AllowUserMount: true, }) // Setup files in devtmpfs. @@ -351,33 +363,33 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config. for i := range mounts { submount := &mounts[i] - log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options) + log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.mount.Source, submount.mount.Destination, submount.mount.Type, submount.mount.Options) var ( mnt *vfs.Mount err error ) - if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() { - mnt, err = c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint) + if hint := c.hints.findMount(submount.mount); hint != nil && hint.isSupported() { + mnt, err = c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.mount, hint) if err != nil { - return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err) + return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.mount.Destination, err) } } else { mnt, err = c.mountSubmountVFS2(ctx, conf, mns, creds, submount) if err != nil { - return fmt.Errorf("mount submount %q: %w", submount.Destination, err) + return fmt.Errorf("mount submount %q: %w", submount.mount.Destination, err) } } if mnt != nil && mnt.ReadOnly() { // Switch to ReadWrite while we setup submounts. if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil { - return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.Destination, err) + return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.mount.Destination, err) } // Restore back to ReadOnly at the end. defer func() { if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil { - panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.Destination, err)) + panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.mount.Destination, err)) } }() } @@ -390,8 +402,8 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config. } type mountAndFD struct { - specs.Mount - fd int + mount *specs.Mount + fd int } func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) { @@ -399,15 +411,18 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) { // undocumented assumption that FDs are dispensed in the order in which // they are required by mounts. var mounts []mountAndFD - for _, m := range c.mounts { - fd := -1 + for i := range c.mounts { + m := &c.mounts[i] + specutils.MaybeConvertToBindMount(m) + // Only bind mounts use host FDs; see // containerMounter.getMountNameAndOptionsVFS2. + fd := -1 if m.Type == bind { fd = c.fds.remove() } mounts = append(mounts, mountAndFD{ - Mount: m, + mount: m, fd: fd, }) } @@ -417,7 +432,7 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) { // Sort the mounts so that we don't place children before parents. 
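// Illustrative sketch (not part of the patch): why sorting by destination
// length, as done below, mounts parents before children. A parent mount point
// is a strict prefix of its children's paths and therefore strictly shorter;
// unrelated paths may interleave, which is harmless.
package main

import (
	"fmt"
	"sort"
)

func main() {
	dests := []string{"/a/b", "/ab", "/a"}
	sort.Slice(dests, func(i, j int) bool { return len(dests[i]) < len(dests[j]) })
	fmt.Println(dests) // [/a /ab /a/b]: "/a" is mounted before "/a/b"
}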
sort.Slice(mounts, func(i, j int) bool { - return len(mounts[i].Destination) < len(mounts[j].Destination) + return len(mounts[i].mount.Destination) < len(mounts[j].mount.Destination) }) return mounts, nil @@ -433,16 +448,16 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.C return nil, nil } - if err := c.makeMountPoint(ctx, creds, mns, submount.Destination); err != nil { - return nil, fmt.Errorf("creating mount point %q: %w", submount.Destination, err) + if err := c.makeMountPoint(ctx, creds, mns, submount.mount.Destination); err != nil { + return nil, fmt.Errorf("creating mount point %q: %w", submount.mount.Destination, err) } if useOverlay { - log.Infof("Adding overlay on top of mount %q", submount.Destination) + log.Infof("Adding overlay on top of mount %q", submount.mount.Destination) var cleanup func() opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName) if err != nil { - return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.Destination, err) + return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.mount.Destination, err) } defer cleanup() fsName = overlay.Name @@ -454,26 +469,34 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.C target := &vfs.PathOperation{ Root: root, Start: root, - Path: fspath.Parse(submount.Destination), + Path: fspath.Parse(submount.mount.Destination), } mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts) if err != nil { - return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) + return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.mount.Destination, submount.mount.Type, err, opts) } - log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts.GetFilesystemOptions.Data) + log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.mount.Source, submount.mount.Destination, submount.mount.Type, opts.GetFilesystemOptions.Data) return mnt, nil } // getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values // used for mounts. func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, bool, error) { - fsName := m.Type + fsName := m.mount.Type useOverlay := false - var data []string - var iopts interface{} + var ( + data []string + internalData interface{} + ) + + verityData, verityOpts, verityRequested, remainingMOpts, err := parseVerityMountOptions(m.mount.Options) + if err != nil { + return "", nil, false, err + } + m.mount.Options = remainingMOpts // Find filesystem name and FS specific data field. - switch m.Type { + switch m.mount.Type { case devpts.Name, devtmpfs.Name, proc.Name, sys.Name: // Nothing to do. @@ -482,7 +505,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo case tmpfs.Name: var err error - data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...) + data, err = parseAndFilterOptions(m.mount.Options, tmpfsAllowedData...) if err != nil { return "", nil, false, err } @@ -494,28 +517,35 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo // but unlikely to be correct in this context. 
return "", nil, false, fmt.Errorf("9P mount requires a connection FD") } - data = p9MountData(m.fd, c.getMountAccessType(conf, m.Mount), true /* vfs2 */) - iopts = gofer.InternalFilesystemOptions{ - UniqueID: m.Destination, + data = p9MountData(m.fd, c.getMountAccessType(conf, m.mount), true /* vfs2 */) + internalData = gofer.InternalFilesystemOptions{ + UniqueID: m.mount.Destination, } // If configured, add overlay to all writable mounts. - useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly + useOverlay = conf.Overlay && !mountFlags(m.mount.Options).ReadOnly + + case cgroupfs.Name: + var err error + data, err = parseAndFilterOptions(m.mount.Options, cgroupfs.SupportedMountOptions...) + if err != nil { + return "", nil, false, err + } default: - log.Warningf("ignoring unknown filesystem type %q", m.Type) + log.Warningf("ignoring unknown filesystem type %q", m.mount.Type) return "", nil, false, nil } opts := &vfs.MountOptions{ GetFilesystemOptions: vfs.GetFilesystemOptions{ Data: strings.Join(data, ","), - InternalData: iopts, + InternalData: internalData, }, InternalMount: true, } - for _, o := range m.Options { + for _, o := range m.mount.Options { switch o { case "rw": opts.ReadOnly = false @@ -525,14 +555,82 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo opts.Flags.NoATime = true case "noexec": opts.Flags.NoExec = true + case "bind", "rbind": + // These are the same as a mount with type="bind". default: log.Warningf("ignoring unknown mount option %q", o) } } + if verityRequested { + verityData = verityData + "root_name=" + path.Base(m.mount.Destination) + verityOpts.LowerName = fsName + verityOpts.LowerGetFSOptions = opts.GetFilesystemOptions + fsName = verity.Name + opts = &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + Data: verityData, + InternalData: verityOpts, + }, + InternalMount: true, + } + } + return fsName, opts, useOverlay, nil } +func parseKeyValue(s string) (string, string, bool) { + tokens := strings.SplitN(s, "=", 2) + if len(tokens) < 2 { + return "", "", false + } + return strings.TrimSpace(tokens[0]), strings.TrimSpace(tokens[1]), true +} + +// parseVerityMountOptions scans the provided mount options for verity-related +// mount options. It returns the parsed set of verity mount options, as well as +// the filtered set of mount options unrelated to verity.
+func parseVerityMountOptions(mopts []string) (string, verity.InternalFilesystemOptions, bool, []string, error) { + nonVerity := []string{} + found := false + var rootHash string + verityOpts := verity.InternalFilesystemOptions{ + Action: verity.PanicOnViolation, + } + for _, o := range mopts { + if !strings.HasPrefix(o, "verity.") { + nonVerity = append(nonVerity, o) + continue + } + + k, v, ok := parseKeyValue(o) + if !ok { + return "", verityOpts, found, nonVerity, fmt.Errorf("invalid verity mount option with no value: %q", o) + } + + found = true + switch k { + case "verity.roothash": + rootHash = v + case "verity.action": + switch v { + case "error": + verityOpts.Action = verity.ErrorOnViolation + case "panic": + verityOpts.Action = verity.PanicOnViolation + default: + log.Warningf("Invalid verity action %q", v) + verityOpts.Action = verity.PanicOnViolation + } + default: + return "", verityOpts, found, nonVerity, fmt.Errorf("unknown verity mount option: %q", k) + } + } + verityOpts.AllowRuntimeEnable = len(rootHash) == 0 + verityData := "root_hash=" + rootHash + "," + return verityData, verityOpts, found, nonVerity, nil +} + // mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so. // Technically we don't have to mount tmpfs at /tmp, as we could just rely on // the host /tmp, but this is a nice optimization, and fixes some apps that call @@ -594,7 +692,7 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config // another user. This is normally done for /tmp. Options: []string{"mode=01777"}, } - _, err := c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount}) + _, err := c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{mount: &tmpMount}) return err case syserror.ENOTDIR: @@ -633,7 +731,7 @@ func (c *containerMounter) processHintsVFS2(conf *config.Config, creds *auth.Cre func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *config.Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. - mntFD := &mountAndFD{Mount: hint.mount} + mntFD := &mountAndFD{mount: &hint.mount} fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, mntFD) if err != nil { return nil, err @@ -643,11 +741,11 @@ func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *conf } if useOverlay { - log.Infof("Adding overlay on top of shared mount %q", mntFD.Destination) + log.Infof("Adding overlay on top of shared mount %q", mntFD.mount.Destination) var cleanup func() opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName) if err != nil { - return nil, fmt.Errorf("mounting shared volume with overlay at %q: %w", mntFD.Destination, err) + return nil, fmt.Errorf("mounting shared volume with overlay at %q: %w", mntFD.mount.Destination, err) } defer cleanup() fsName = overlay.Name @@ -658,14 +756,14 @@ func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *conf // mountSharedSubmount binds mount to a previously mounted volume that is shared // among containers in the same pod. 
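// Worked example (not part of the patch): expected behavior of
// parseVerityMountOptions above on a sample option list; the values are
// illustrative.
//
//   data, vopts, found, rest, err := parseVerityMountOptions(
//       []string{"rw", "verity.roothash=abc123", "verity.action=error"})
//
//   // data  == "root_hash=abc123,"         (consumed by the verity FS)
//   // vopts.Action             == verity.ErrorOnViolation
//   // vopts.AllowRuntimeEnable == false    (a root hash was supplied)
//   // found == true                        (verity was requested)
//   // rest  == []string{"rw"}              (passed through to the lower FS)
//   // err   == nil
//
// With no "verity.*" options at all, found is false and the mount proceeds
// untouched; with verity options but no roothash, AllowRuntimeEnable is true
// and data is "root_hash=,".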
-func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) (*vfs.Mount, error) { +func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount *specs.Mount, source *mountHint) (*vfs.Mount, error) { if err := source.checkCompatible(mount); err != nil { return nil, err } // Ignore data and useOverlay because these were already applied to // the master mount. - _, opts, _, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount}) + _, opts, _, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{mount: mount}) if err != nil { return nil, err } @@ -718,7 +816,7 @@ func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Crede // configureRestore returns an updated context.Context including filesystem // state used by restore defined by conf. -func (c *containerMounter) configureRestore(ctx context.Context, conf *config.Config) (context.Context, error) { +func (c *containerMounter) configureRestore(ctx context.Context) (context.Context, error) { fdmap := make(map[string]int) fdmap["/"] = c.fds.remove() mounts, err := c.prepareMountsVFS2() @@ -728,7 +826,7 @@ func (c *containerMounter) configureRestore(ctx context.Context, conf *config.Co for i := range c.mounts { submount := &mounts[i] if submount.fd >= 0 { - fdmap[submount.Destination] = submount.fd + fdmap[submount.mount.Destination] = submount.fd } } return context.WithValue(ctx, gofer.CtxRestoreServerFDMap, fdmap), nil diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index 438b7ef3e..335e46499 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -40,23 +40,24 @@ const ( cgroupRoot = "/sys/fs/cgroup" ) -var controllers = map[string]config{ - "blkio": {ctrlr: &blockIO{}}, - "cpu": {ctrlr: &cpu{}}, - "cpuset": {ctrlr: &cpuSet{}}, - "hugetlb": {ctrlr: &hugeTLB{}, optional: true}, - "memory": {ctrlr: &memory{}}, - "net_cls": {ctrlr: &networkClass{}}, - "net_prio": {ctrlr: &networkPrio{}}, - "pids": {ctrlr: &pids{}}, +var controllers = map[string]controller{ + "blkio": &blockIO{}, + "cpu": &cpu{}, + "cpuset": &cpuSet{}, + "hugetlb": &hugeTLB{}, + "memory": &memory{}, + "net_cls": &networkClass{}, + "net_prio": &networkPrio{}, + "pids": &pids{}, // These controllers either don't have anything in the OCI spec or is // irrelevant for a sandbox. - "devices": {ctrlr: &noop{}}, - "freezer": {ctrlr: &noop{}}, - "perf_event": {ctrlr: &noop{}}, - "rdma": {ctrlr: &noop{}, optional: true}, - "systemd": {ctrlr: &noop{}}, + "cpuacct": &noop{}, + "devices": &noop{}, + "freezer": &noop{}, + "perf_event": &noop{}, + "rdma": &noop{isOptional: true}, + "systemd": &noop{}, } // IsOnlyV2 checks whether cgroups V2 is enabled and V1 is not. @@ -201,31 +202,26 @@ func countCpuset(cpuset string) (int, error) { return count, nil } -// LoadPaths loads cgroup paths for given 'pid', may be set to 'self'. -func LoadPaths(pid string) (map[string]string, error) { - f, err := os.Open(filepath.Join("/proc", pid, "cgroup")) +// loadPaths loads cgroup paths for given 'pid', may be set to 'self'. 
+func loadPaths(pid string) (map[string]string, error) { + procCgroup, err := os.Open(filepath.Join("/proc", pid, "cgroup")) if err != nil { return nil, err } - defer f.Close() + defer procCgroup.Close() - return loadPathsHelper(f) -} - -func loadPathsHelper(cgroup io.Reader) (map[string]string, error) { - // For nested containers, in /proc/self/cgroup we see paths from host, - // which don't exist in container, so recover the container paths here by - // double-checking with /proc/pid/mountinfo - mountinfo, err := os.Open("/proc/self/mountinfo") + // Load mountinfo for the current process, because it's where cgroups is + // being accessed from. + mountinfo, err := os.Open(filepath.Join("/proc/self/mountinfo")) if err != nil { return nil, err } defer mountinfo.Close() - return loadPathsHelperWithMountinfo(cgroup, mountinfo) + return loadPathsHelper(procCgroup, mountinfo) } -func loadPathsHelperWithMountinfo(cgroup, mountinfo io.Reader) (map[string]string, error) { +func loadPathsHelper(cgroup, mountinfo io.Reader) (map[string]string, error) { paths := make(map[string]string) scanner := bufio.NewScanner(cgroup) @@ -242,34 +238,51 @@ func loadPathsHelperWithMountinfo(cgroup, mountinfo io.Reader) (map[string]strin for _, ctrlr := range strings.Split(tokens[1], ",") { // Remove prefix for cgroups with no controller, eg. systemd. ctrlr = strings.TrimPrefix(ctrlr, "name=") - paths[ctrlr] = tokens[2] + // Discard unknown controllers. + if _, ok := controllers[ctrlr]; ok { + paths[ctrlr] = tokens[2] + } } } if err := scanner.Err(); err != nil { return nil, err } - mfScanner := bufio.NewScanner(mountinfo) - for mfScanner.Scan() { - txt := mfScanner.Text() - fields := strings.Fields(txt) + // For nested containers, in /proc/[pid]/cgroup we see paths from host, + // which don't exist in container, so recover the container paths here by + // double-checking with /proc/[pid]/mountinfo + mountScanner := bufio.NewScanner(mountinfo) + for mountScanner.Scan() { + // Format: ID parent major:minor root mount-point options opt-fields - fs-type source super-options + // Example: 39 32 0:34 / /sys/fs/cgroup/devices rw,noexec shared:18 - cgroup cgroup rw,devices + fields := strings.Fields(mountScanner.Text()) if len(fields) < 9 || fields[len(fields)-3] != "cgroup" { + // Skip mounts that are not cgroup mounts. continue } - for _, opt := range strings.Split(fields[len(fields)-1], ",") { + // Cgroup controller type is in the super-options field. + superOptions := strings.Split(fields[len(fields)-1], ",") + for _, opt := range superOptions { // Remove prefix for cgroups with no controller, eg. systemd. opt = strings.TrimPrefix(opt, "name=") + + // Only considers cgroup controllers that are registered, and skip other + // irrelevant options, e.g. rw. if cgroupPath, ok := paths[opt]; ok { - root := fields[3] - relCgroupPath, err := filepath.Rel(root, cgroupPath) - if err != nil { - return nil, err + rootDir := fields[3] + if rootDir != "/" { + // When cgroup is in submount, remove repeated path components from + // cgroup path to avoid duplicating them. + relCgroupPath, err := filepath.Rel(rootDir, cgroupPath) + if err != nil { + return nil, err + } + paths[opt] = relCgroupPath } - paths[opt] = relCgroupPath } } } - if err := mfScanner.Err(); err != nil { + if err := mountScanner.Err(); err != nil { return nil, err } @@ -279,37 +292,50 @@ func loadPathsHelperWithMountinfo(cgroup, mountinfo io.Reader) (map[string]strin // Cgroup represents a group inside all controllers. 
For example: // Name='/foo/bar' maps to /sys/fs/cgroup/<controller>/foo/bar on // all controllers. +// +// If Name is relative, it uses the parent cgroup path to determine the +// location. For example: +// Name='foo/bar' and Parent[ctrl]="/user.slice", then it will map to +// /sys/fs/cgroup/<ctrl>/user.slice/foo/bar type Cgroup struct { Name string `json:"name"` Parents map[string]string `json:"parents"` Own map[string]bool `json:"own"` } -// New creates a new Cgroup instance if the spec includes a cgroup path. -// Returns nil otherwise. -func New(spec *specs.Spec) (*Cgroup, error) { +// NewFromSpec creates a new Cgroup instance if the spec includes a cgroup path. +// Returns nil otherwise. Cgroup paths are loaded based on the current process. +func NewFromSpec(spec *specs.Spec) (*Cgroup, error) { if spec.Linux == nil || spec.Linux.CgroupsPath == "" { return nil, nil } - return NewFromPath(spec.Linux.CgroupsPath) + return new("self", spec.Linux.CgroupsPath) } -// NewFromPath creates a new Cgroup instance. -func NewFromPath(cgroupsPath string) (*Cgroup, error) { +// NewFromPid loads cgroup for the given process. +func NewFromPid(pid int) (*Cgroup, error) { + return new(strconv.Itoa(pid), "") +} + +func new(pid, cgroupsPath string) (*Cgroup, error) { var parents map[string]string + + // If path is relative, load cgroup paths for the process to build the + // relative paths. if !filepath.IsAbs(cgroupsPath) { var err error - parents, err = LoadPaths("self") + parents, err = loadPaths(pid) if err != nil { return nil, fmt.Errorf("finding current cgroups: %w", err) } } - own := make(map[string]bool) - return &Cgroup{ + cg := &Cgroup{ Name: cgroupsPath, Parents: parents, - Own: own, - }, nil + Own: make(map[string]bool), + } + log.Debugf("New cgroup for pid: %s, %+v", pid, cg) + return cg, nil } // Install creates and configures cgroups according to 'res'. If cgroup path @@ -323,8 +349,8 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error { clean := cleanup.Make(func() { _ = c.Uninstall() }) defer clean.Clean() - for key, cfg := range controllers { - path := c.makePath(key) + for key, ctrlr := range controllers { + path := c.MakePath(key) if _, err := os.Stat(path); err == nil { // If cgroup has already been created; it has been setup by caller. Don't // make any changes to configuration, just join when sandbox/gofer starts. @@ -336,13 +362,16 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error { c.Own[key] = true if err := os.MkdirAll(path, 0755); err != nil { - if cfg.optional && errors.Is(err, unix.EROFS) { + if ctrlr.optional() && errors.Is(err, unix.EROFS) { + if err := ctrlr.skip(res); err != nil { + return err + } log.Infof("Skipping cgroup %q", key) continue } return err } - if err := cfg.ctrlr.set(res, path); err != nil { + if err := ctrlr.set(res, path); err != nil { return err } } @@ -359,7 +388,7 @@ func (c *Cgroup) Uninstall() error { // cgroup is managed by caller, don't touch it. continue } - path := c.makePath(key) + path := c.MakePath(key) log.Debugf("Removing cgroup controller for key=%q path=%q", key, path) // If we try to remove the cgroup too soon after killing the @@ -387,7 +416,7 @@ func (c *Cgroup) Uninstall() error { func (c *Cgroup) Join() (func(), error) { // First save the current state so it can be restored. undo := func() {} - paths, err := LoadPaths("self") + paths, err := loadPaths("self") if err != nil { return undo, err } @@ -414,14 +443,13 @@ func (c *Cgroup) Join() (func(), error) { } // Now join the cgroups. 
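// Worked example (not part of the patch): how Cgroup.MakePath, used by the
// join loop below, resolves the two cases described in the Cgroup comment
// above; the paths are illustrative.
//
//   abs := Cgroup{Name: "/foo/bar"}
//   abs.MakePath("cpu") // "/sys/fs/cgroup/cpu/foo/bar"
//
//   rel := Cgroup{
//       Name:    "foo/bar",
//       Parents: map[string]string{"cpu": "/user.slice"},
//   }
//   rel.MakePath("cpu") // "/sys/fs/cgroup/cpu/user.slice/foo/bar"
//
// When a controller has no entry in Parents, Name is used as-is relative to
// /sys/fs/cgroup/<controller>.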
- for key, cfg := range controllers { - path := c.makePath(key) + for key, ctrlr := range controllers { + path := c.MakePath(key) log.Debugf("Joining cgroup %q", path) - // Writing the value 0 to a cgroup.procs file causes the - // writing process to be moved to the corresponding cgroup. - // - cgroups(7). + // Writing the value 0 to a cgroup.procs file causes the writing process to + // be moved to the corresponding cgroup - cgroups(7). if err := setValue(path, "cgroup.procs", "0"); err != nil { - if cfg.optional && os.IsNotExist(err) { + if ctrlr.optional() && os.IsNotExist(err) { continue } return undo, err @@ -432,7 +460,7 @@ func (c *Cgroup) Join() (func(), error) { // CPUQuota returns the CFS CPU quota. func (c *Cgroup) CPUQuota() (float64, error) { - path := c.makePath("cpu") + path := c.MakePath("cpu") quota, err := getInt(path, "cpu.cfs_quota_us") if err != nil { return -1, err @@ -449,7 +477,7 @@ func (c *Cgroup) CPUQuota() (float64, error) { // CPUUsage returns the total CPU usage of the cgroup. func (c *Cgroup) CPUUsage() (uint64, error) { - path := c.makePath("cpuacct") + path := c.MakePath("cpuacct") usage, err := getValue(path, "cpuacct.usage") if err != nil { return 0, err @@ -459,7 +487,7 @@ func (c *Cgroup) CPUUsage() (uint64, error) { // NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'. func (c *Cgroup) NumCPU() (int, error) { - path := c.makePath("cpuset") + path := c.MakePath("cpuset") cpuset, err := getValue(path, "cpuset.cpus") if err != nil { return 0, err @@ -469,7 +497,7 @@ func (c *Cgroup) NumCPU() (int, error) { // MemoryLimit returns the memory limit. func (c *Cgroup) MemoryLimit() (uint64, error) { - path := c.makePath("memory") + path := c.MakePath("memory") limStr, err := getValue(path, "memory.limit_in_bytes") if err != nil { return 0, err @@ -477,7 +505,8 @@ func (c *Cgroup) MemoryLimit() (uint64, error) { return strconv.ParseUint(strings.TrimSpace(limStr), 10, 64) } -func (c *Cgroup) makePath(controllerName string) string { +// MakePath builds a path to the given controller. +func (c *Cgroup) MakePath(controllerName string) string { path := c.Name if parent, ok := c.Parents[controllerName]; ok { path = filepath.Join(parent, c.Name) @@ -485,22 +514,48 @@ func (c *Cgroup) makePath(controllerName string) string { return filepath.Join(cgroupRoot, controllerName, path) } -type config struct { - ctrlr controller - optional bool -} - type controller interface { + // optional controllers don't fail if not found. + optional() bool + // set applies resource limits to controller. set(*specs.LinuxResources, string) error + // skip is called when controller is not found to check if it can be safely + // skipped or not based on the spec. 
+ skip(*specs.LinuxResources) error +} + +type noop struct { + isOptional bool } -type noop struct{} +func (n *noop) optional() bool { + return n.isOptional +} func (*noop) set(*specs.LinuxResources, string) error { return nil } -type memory struct{} +func (n *noop) skip(*specs.LinuxResources) error { + if !n.isOptional { + panic("cgroup controller is not optional") + } + return nil +} + +type mandatory struct{} + +func (*mandatory) optional() bool { + return false +} + +func (*mandatory) skip(*specs.LinuxResources) error { + panic("cgroup controller is not optional") +} + +type memory struct { + mandatory +} func (*memory) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.Memory == nil { @@ -533,7 +588,9 @@ func (*memory) set(spec *specs.LinuxResources, path string) error { return nil } -type cpu struct{} +type cpu struct { + mandatory +} func (*cpu) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.CPU == nil { @@ -554,7 +611,9 @@ func (*cpu) set(spec *specs.LinuxResources, path string) error { return setOptionalValueInt(path, "cpu.rt_runtime_us", spec.CPU.RealtimeRuntime) } -type cpuSet struct{} +type cpuSet struct { + mandatory +} func (*cpuSet) set(spec *specs.LinuxResources, path string) error { // cpuset.cpus and mems are required fields, but are not set on a new cgroup. @@ -576,7 +635,9 @@ func (*cpuSet) set(spec *specs.LinuxResources, path string) error { return setValue(path, "cpuset.mems", spec.CPU.Mems) } -type blockIO struct{} +type blockIO struct { + mandatory +} func (*blockIO) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.BlockIO == nil { @@ -628,6 +689,10 @@ func setThrottle(path, name string, devs []specs.LinuxThrottleDevice) error { type networkClass struct{} +func (*networkClass) optional() bool { + return true +} + func (*networkClass) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.Network == nil { return nil @@ -635,8 +700,19 @@ func (*networkClass) set(spec *specs.LinuxResources, path string) error { return setOptionalValueUint32(path, "net_cls.classid", spec.Network.ClassID) } +func (*networkClass) skip(spec *specs.LinuxResources) error { + if spec != nil && spec.Network != nil && spec.Network.ClassID != nil { + return fmt.Errorf("Network.ClassID set but net_cls cgroup controller not found") + } + return nil +} + type networkPrio struct{} +func (*networkPrio) optional() bool { + return true +} + func (*networkPrio) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.Network == nil { return nil @@ -650,7 +726,16 @@ func (*networkPrio) set(spec *specs.LinuxResources, path string) error { return nil } -type pids struct{} +func (*networkPrio) skip(spec *specs.LinuxResources) error { + if spec != nil && spec.Network != nil && len(spec.Network.Priorities) > 0 { + return fmt.Errorf("Network.Priorities set but net_prio cgroup controller not found") + } + return nil +} + +type pids struct { + mandatory +} func (*pids) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.Pids == nil || spec.Pids.Limit <= 0 { @@ -662,6 +747,17 @@ func (*pids) set(spec *specs.LinuxResources, path string) error { type hugeTLB struct{} +func (*hugeTLB) optional() bool { + return true +} + +func (*hugeTLB) skip(spec *specs.LinuxResources) error { + if spec != nil && len(spec.HugepageLimits) > 0 { + return fmt.Errorf("HugepageLimits set but hugetlb cgroup controller not found") + } + return nil +} + func (*hugeTLB) set(spec 
*specs.LinuxResources, path string) error { if spec == nil { return nil diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go index 48d71cfa6..02cadeef4 100644 --- a/runsc/cgroup/cgroup_test.go +++ b/runsc/cgroup/cgroup_test.go @@ -43,19 +43,19 @@ var debianMountinfo = ` ` var dindMountinfo = ` -1305 1304 0:64 / /sys/fs/cgroup rw - tmpfs tmpfs rw,mode=755 -1306 1305 0:32 /docker/136 /sys/fs/cgroup/systemd ro master:11 - cgroup cgroup rw,xattr,name=systemd -1307 1305 0:36 /docker/136 /sys/fs/cgroup/cpu,cpuacct ro master:16 - cgroup cgroup rw,cpu,cpuacct -1308 1305 0:37 /docker/136 /sys/fs/cgroup/freezer ro master:17 - cgroup cgroup rw,freezer -1309 1305 0:38 /docker/136 /sys/fs/cgroup/hugetlb ro master:18 - cgroup cgroup rw,hugetlb -1310 1305 0:39 /docker/136 /sys/fs/cgroup/cpuset ro master:19 - cgroup cgroup rw,cpuset -1311 1305 0:40 /docker/136 /sys/fs/cgroup/net_cls,net_prio ro master:20 - cgroup cgroup rw,net_cls,net_prio -1312 1305 0:41 /docker/136 /sys/fs/cgroup/pids ro master:21 - cgroup cgroup rw,pids -1313 1305 0:42 /docker/136 /sys/fs/cgroup/perf_event ro master:22 - cgroup cgroup rw,perf_event -1314 1305 0:43 /docker/136 /sys/fs/cgroup/memory ro master:23 - cgroup cgroup rw,memory -1316 1305 0:44 /docker/136 /sys/fs/cgroup/blkio ro master:24 - cgroup cgroup rw,blkio -1317 1305 0:45 /docker/136 /sys/fs/cgroup/devices ro master:25 - cgroup cgroup rw,devices -1318 1305 0:46 / /sys/fs/cgroup/rdma ro master:26 - cgroup cgroup rw,rdma +05 04 0:64 / /sys/fs/cgroup rw - tmpfs tmpfs rw,mode=755 +06 05 0:32 /docker/136 /sys/fs/cgroup/systemd ro master:11 - cgroup cgroup rw,xattr,name=systemd +07 05 0:36 /docker/136 /sys/fs/cgroup/cpu,cpuacct ro master:16 - cgroup cgroup rw,cpu,cpuacct +08 05 0:37 /docker/136 /sys/fs/cgroup/freezer ro master:17 - cgroup cgroup rw,freezer +09 05 0:38 /docker/136 /sys/fs/cgroup/hugetlb ro master:18 - cgroup cgroup rw,hugetlb +10 05 0:39 /docker/136 /sys/fs/cgroup/cpuset ro master:19 - cgroup cgroup rw,cpuset +11 05 0:40 /docker/136 /sys/fs/cgroup/net_cls,net_prio ro master:20 - cgroup cgroup rw,net_cls,net_prio +12 05 0:41 /docker/136 /sys/fs/cgroup/pids ro master:21 - cgroup cgroup rw,pids +13 05 0:42 /docker/136 /sys/fs/cgroup/perf_event ro master:22 - cgroup cgroup rw,perf_event +14 05 0:43 /docker/136 /sys/fs/cgroup/memory ro master:23 - cgroup cgroup rw,memory +16 05 0:44 /docker/136 /sys/fs/cgroup/blkio ro master:24 - cgroup cgroup rw,blkio +17 05 0:45 /docker/136 /sys/fs/cgroup/devices ro master:25 - cgroup cgroup rw,devices +18 05 0:46 / /sys/fs/cgroup/rdma ro master:26 - cgroup cgroup rw,rdma ` func TestUninstallEnoent(t *testing.T) { @@ -693,36 +693,42 @@ func TestLoadPaths(t *testing.T) { err string }{ { - name: "abs-path-unknown-controller", - cgroups: "0:ctr:/path", + name: "empty", mountinfo: debianMountinfo, - want: map[string]string{"ctr": "/path"}, + }, + { + name: "abs-path", + cgroups: "0:cpu:/path", + mountinfo: debianMountinfo, + want: map[string]string{"cpu": "/path"}, }, { name: "rel-path", - cgroups: "0:ctr:rel-path", + cgroups: "0:cpu:rel-path", mountinfo: debianMountinfo, - want: map[string]string{"ctr": "rel-path"}, + want: map[string]string{"cpu": "rel-path"}, }, { name: "non-controller", cgroups: "0:name=systemd:/path", mountinfo: debianMountinfo, - want: map[string]string{"systemd": "path"}, + want: map[string]string{"systemd": "/path"}, }, { - name: "empty", + name: "unknown-controller", + cgroups: "0:ctr:/path", mountinfo: debianMountinfo, + want: map[string]string{}, }, { name: "multiple", - cgroups: 
"0:ctr0:/path0\n" + - "1:ctr1:/path1\n" + + cgroups: "0:cpu:/path0\n" + + "1:memory:/path1\n" + "2::/empty\n", mountinfo: debianMountinfo, want: map[string]string{ - "ctr0": "/path0", - "ctr1": "/path1", + "cpu": "/path0", + "memory": "/path1", }, }, { @@ -747,10 +753,10 @@ func TestLoadPaths(t *testing.T) { }, { name: "nested-cgroup", - cgroups: `9:memory:/docker/136 -2:cpu,cpuacct:/docker/136 -1:name=systemd:/docker/136 -0::/system.slice/containerd.service`, + cgroups: "9:memory:/docker/136\n" + + "2:cpu,cpuacct:/docker/136\n" + + "1:name=systemd:/docker/136\n" + + "0::/system.slice/containerd.service\n", mountinfo: dindMountinfo, // we want relative path to /sys/fs/cgroup inside the nested container. // Subcroup inside the container will be created at /sys/fs/cgroup/cpu @@ -781,15 +787,15 @@ func TestLoadPaths(t *testing.T) { }, { name: "invalid-rel-path-in-proc-cgroup", - cgroups: "9:memory:./invalid", + cgroups: "9:memory:invalid", mountinfo: dindMountinfo, - err: "can't make ./invalid relative to /docker/136", + err: "can't make invalid relative to /docker/136", }, } { t.Run(tc.name, func(t *testing.T) { r := strings.NewReader(tc.cgroups) mountinfo := strings.NewReader(tc.mountinfo) - got, err := loadPathsHelperWithMountinfo(r, mountinfo) + got, err := loadPathsHelper(r, mountinfo) if len(tc.err) == 0 { if err != nil { t.Fatalf("Unexpected error: %v", err) @@ -813,3 +819,47 @@ func TestLoadPaths(t *testing.T) { }) } } + +func TestOptional(t *testing.T) { + for _, tc := range []struct { + name string + ctrlr controller + spec *specs.LinuxResources + err string + }{ + { + name: "net-cls", + ctrlr: &networkClass{}, + spec: &specs.LinuxResources{Network: &specs.LinuxNetwork{ClassID: uint32Ptr(1)}}, + err: "Network.ClassID set but net_cls cgroup controller not found", + }, + { + name: "net-prio", + ctrlr: &networkPrio{}, + spec: &specs.LinuxResources{Network: &specs.LinuxNetwork{ + Priorities: []specs.LinuxInterfacePriority{ + {Name: "foo", Priority: 1}, + }, + }}, + err: "Network.Priorities set but net_prio cgroup controller not found", + }, + { + name: "hugetlb", + ctrlr: &hugeTLB{}, + spec: &specs.LinuxResources{HugepageLimits: []specs.LinuxHugepageLimit{ + {Pagesize: "1", Limit: 2}, + }}, + err: "HugepageLimits set but hugetlb cgroup controller not found", + }, + } { + t.Run(tc.name, func(t *testing.T) { + err := tc.ctrlr.skip(tc.spec) + if err == nil { + t.Fatalf("ctrlr.skip() didn't fail") + } + if !strings.Contains(err.Error(), tc.err) { + t.Errorf("ctrlr.skip() want: *%s*, got: %q", tc.err, err) + } + }) + } +} diff --git a/runsc/cli/BUILD b/runsc/cli/BUILD index f1e3cce68..360e3cea6 100644 --- a/runsc/cli/BUILD +++ b/runsc/cli/BUILD @@ -10,8 +10,10 @@ go_library( "//runsc:__pkg__", ], deps = [ + "//pkg/coverage", "//pkg/log", "//pkg/refs", + "//pkg/refsvfs2", "//pkg/sentry/platform", "//runsc/cmd", "//runsc/config", diff --git a/runsc/cli/main.go b/runsc/cli/main.go index a3c515f4b..76184cd9c 100644 --- a/runsc/cli/main.go +++ b/runsc/cli/main.go @@ -27,8 +27,10 @@ import ( "github.com/google/subcommands" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/coverage" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/runsc/cmd" "gvisor.dev/gvisor/runsc/config" @@ -50,6 +52,7 @@ var ( logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.") debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. 
If set, the 'debug-log-dir' flag is ignored.") panicLogFD = flag.Int("panic-log-fd", -1, "file descriptor to write Go's runtime messages.") + coverageFD = flag.Int("coverage-fd", -1, "file descriptor to write Go coverage output.") ) // Main is the main entrypoint. @@ -86,6 +89,7 @@ func Main(version string) { subcommands.Register(new(cmd.Symbolize), "") subcommands.Register(new(cmd.Wait), "") subcommands.Register(new(cmd.Mitigate), "") + subcommands.Register(new(cmd.VerityPrepare), "") // Register internal commands with the internal group name. This causes // them to be sorted below the user-facing commands with empty group. @@ -204,6 +208,10 @@ func Main(version string) { } else if conf.AlsoLogToStderr { e = &log.MultiEmitter{e, newEmitter(conf.DebugLogFormat, os.Stderr)} } + if *coverageFD >= 0 { + f := os.NewFile(uintptr(*coverageFD), "coverage file") + coverage.EnableReport(f) + } log.SetTarget(e) @@ -233,6 +241,9 @@ func Main(version string) { // Call the subcommand and pass in the configuration. var ws unix.WaitStatus subcmdCode := subcommands.Execute(context.Background(), conf, &ws) + // Check for leaks and write coverage report before os.Exit(). + refsvfs2.DoLeakCheck() + coverage.Report() if subcmdCode == subcommands.ExitSuccess { log.Infof("Exiting with status: %v", ws) if ws.Signaled() { diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 2c3b4058b..39c8ff603 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -23,6 +23,7 @@ go_library( "kill.go", "list.go", "mitigate.go", + "mitigate_extras.go", "path.go", "pause.go", "ps.go", @@ -35,6 +36,7 @@ go_library( "statefile.go", "symbolize.go", "syscalls.go", + "verity_prepare.go", "wait.go", ], visibility = [ diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go index 455c57692..5485db149 100644 --- a/runsc/cmd/do.go +++ b/runsc/cmd/do.go @@ -126,9 +126,8 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su Hostname: hostname, } - specutils.LogSpec(spec) - cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000)) + if conf.Network == config.NetworkNone { addNamespace(spec, specs.LinuxNamespace{Type: specs.NetworkNamespace}) @@ -154,55 +153,7 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su } } - out, err := json.Marshal(spec) - if err != nil { - return Errorf("Error to marshal spec: %v", err) - } - tmpDir, err := ioutil.TempDir("", "runsc-do") - if err != nil { - return Errorf("Error to create tmp dir: %v", err) - } - defer os.RemoveAll(tmpDir) - - log.Infof("Changing configuration RootDir to %q", tmpDir) - conf.RootDir = tmpDir - - cfgPath := filepath.Join(tmpDir, "config.json") - if err := ioutil.WriteFile(cfgPath, out, 0755); err != nil { - return Errorf("Error write spec: %v", err) - } - - containerArgs := container.Args{ - ID: cid, - Spec: spec, - BundleDir: tmpDir, - Attached: true, - } - ct, err := container.New(conf, containerArgs) - if err != nil { - return Errorf("creating container: %v", err) - } - defer ct.Destroy() - - if err := ct.Start(conf); err != nil { - return Errorf("starting container: %v", err) - } - - // Forward signals to init in the container. Thus if we get SIGINT from - // ^C, the container gracefully exit, and we can clean up. - // - // N.B. There is a still a window before this where a signal may kill - // this process, skipping cleanup. 
- stopForwarding := ct.ForwardSignals(0 /* pid */, false /* fgProcess */)
- defer stopForwarding()
-
- ws, err := ct.Wait()
- if err != nil {
- return Errorf("waiting for container: %v", err)
- }
-
- *waitStatus = ws
- return subcommands.ExitSuccess
+ return startContainerAndWait(spec, conf, cid, waitStatus)
}

func addNamespace(spec *specs.Spec, ns specs.LinuxNamespace) {
@@ -397,3 +348,58 @@ func calculatePeerIP(ip string) (string, error) {
 }
 return fmt.Sprintf("%s.%s.%s.%d", parts[0], parts[1], parts[2], n), nil
}
+
+func startContainerAndWait(spec *specs.Spec, conf *config.Config, cid string, waitStatus *unix.WaitStatus) subcommands.ExitStatus {
+ specutils.LogSpec(spec)
+
+ out, err := json.Marshal(spec)
+ if err != nil {
+ return Errorf("Error marshaling spec: %v", err)
+ }
+ tmpDir, err := ioutil.TempDir("", "runsc-do")
+ if err != nil {
+ return Errorf("Error creating tmp dir: %v", err)
+ }
+ defer os.RemoveAll(tmpDir)
+
+ log.Infof("Changing configuration RootDir to %q", tmpDir)
+ conf.RootDir = tmpDir
+
+ cfgPath := filepath.Join(tmpDir, "config.json")
+ if err := ioutil.WriteFile(cfgPath, out, 0755); err != nil {
+ return Errorf("Error writing spec: %v", err)
+ }
+
+ containerArgs := container.Args{
+ ID: cid,
+ Spec: spec,
+ BundleDir: tmpDir,
+ Attached: true,
+ }
+
+ ct, err := container.New(conf, containerArgs)
+ if err != nil {
+ return Errorf("creating container: %v", err)
+ }
+ defer ct.Destroy()
+
+ if err := ct.Start(conf); err != nil {
+ return Errorf("starting container: %v", err)
+ }
+
+ // Forward signals to init in the container. Thus if we get SIGINT from
+ // ^C, the container gracefully exits, and we can clean up.
+ //
+ // N.B. There is still a window before this where a signal may kill
+ // this process, skipping cleanup.
+ stopForwarding := ct.ForwardSignals(0 /* pid */, false /* fgProcess */)
+ defer stopForwarding()
+
+ ws, err := ct.Wait()
+ if err != nil {
+ return Errorf("waiting for container: %v", err)
+ }
+
+ *waitStatus = ws
+ return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 4cb0164dd..6a755ecb6 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -176,7 +176,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 mountIdx := 1 // first one is the root
 for _, m := range spec.Mounts {
- if specutils.Is9PMount(m) {
+ if specutils.Is9PMount(m, conf.VFS2) {
 cfg := fsgofer.Config{
 ROMount: isReadonlyMount(m.Options) || conf.Overlay,
 HostUDS: conf.FSGoferHostUDS,
@@ -350,7 +350,7 @@ func setupRootFS(spec *specs.Spec, conf *config.Config) error {
 // creates directories as needed.
 func setupMounts(conf *config.Config, mounts []specs.Mount, root string) error {
 for _, m := range mounts {
- if m.Type != "bind" || !specutils.IsVFS1SupportedDevMount(m) {
+ if !specutils.Is9PMount(m, conf.VFS2) {
 continue
 }
@@ -390,7 +390,7 @@ func setupMounts(conf *config.Config, mounts []specs.Mount, root string) error {
 func resolveMounts(conf *config.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
 cleanMounts := make([]specs.Mount, 0, len(mounts))
 for _, m := range mounts {
- if m.Type != "bind" || !specutils.IsVFS1SupportedDevMount(m) {
+ if !specutils.Is9PMount(m, conf.VFS2) {
 cleanMounts = append(cleanMounts, m)
 continue
 }
diff --git a/runsc/cmd/mitigate.go b/runsc/cmd/mitigate.go
index 720141aa5..f4e65adb8 100644
--- a/runsc/cmd/mitigate.go
+++ b/runsc/cmd/mitigate.go
@@ -41,8 +41,8 @@ type Mitigate struct {
 reverse bool
 // Path to file to read to create CPUSet. 
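 // Per the usage text this defaults to /proc/cpuinfo; when --reverse is set,
 // Execute swaps it for allPossibleCPUs (/sys/devices/system/cpu/possible).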
path string - // Callback to check if a given thread is vulnerable. - vulnerable func(other mitigate.Thread) bool + // Extra data for post mitigate operations. + data string } // Name implements subcommands.command.name. @@ -55,19 +55,20 @@ func (*Mitigate) Synopsis() string { return "mitigate mitigates the underlying system against side channel attacks" } -// Usage implments Usage for cmd.Mitigate. +// Usage implements Usage for cmd.Mitigate. func (m Mitigate) Usage() string { - return `mitigate [flags] + return fmt.Sprintf(`mitigate [flags] mitigate mitigates a system to the "MDS" vulnerability by implementing a manual shutdown of SMT. The command checks /proc/cpuinfo for cpus having the MDS vulnerability, and if found, shutdown all but one CPU per hyperthread pair via /sys/devices/system/cpu/cpu{N}/online. CPUs can be restored by writing "2" to each file in /sys/devices/system/cpu/cpu{N}/online or performing a system reboot. -The command can be reversed with --reverse, which reads the total CPUs from /sys/devices/system/cpu/possible and enables all with /sys/devices/system/cpu/cpu{N}/online.` +The command can be reversed with --reverse, which reads the total CPUs from /sys/devices/system/cpu/possible and enables all with /sys/devices/system/cpu/cpu{N}/online.%s`, m.usage()) } // SetFlags sets flags for the command Mitigate. func (m *Mitigate) SetFlags(f *flag.FlagSet) { f.BoolVar(&m.dryRun, "dryrun", false, "run the command without changing system") f.BoolVar(&m.reverse, "reverse", false, "reverse mitigate by enabling all CPUs") + m.setFlags(f) } // Execute implements subcommands.Command.Execute. @@ -87,13 +88,17 @@ func (m *Mitigate) Execute(_ context.Context, f *flag.FlagSet, args ...interface m.path = allPossibleCPUs } - m.vulnerable = func(other mitigate.Thread) bool { - return other.IsVulnerable() + set, err := m.doExecute() + if err != nil { + return Errorf("Execute failed: %v", err) } - if _, err := m.doExecute(); err != nil { - log.Warningf("Execute failed: %v", err) - return subcommands.ExitFailure + if m.data == "" { + return subcommands.ExitSuccess + } + + if err = m.postMitigate(set); err != nil { + return Errorf("Post Mitigate failed: %v", err) } return subcommands.ExitSuccess @@ -104,32 +109,26 @@ func (m *Mitigate) doExecute() (mitigate.CPUSet, error) { if m.dryRun { log.Infof("Running with DryRun. 
No cpu settings will be changed.") } + data, err := ioutil.ReadFile(m.path) + if err != nil { + return nil, fmt.Errorf("failed to read %s: %w", m.path, err) + } if m.reverse { - data, err := ioutil.ReadFile(m.path) - if err != nil { - return nil, fmt.Errorf("failed to read %s: %v", m.path, err) - } - set, err := m.doReverse(data) if err != nil { - return nil, fmt.Errorf("reverse operation failed: %v", err) + return nil, fmt.Errorf("reverse operation failed: %w", err) } return set, nil } - - data, err := ioutil.ReadFile(m.path) - if err != nil { - return nil, fmt.Errorf("failed to read %s: %v", m.path, err) - } set, err := m.doMitigate(data) if err != nil { - return nil, fmt.Errorf("mitigate operation failed: %v", err) + return nil, fmt.Errorf("mitigate operation failed: %w", err) } return set, nil } func (m *Mitigate) doMitigate(data []byte) (mitigate.CPUSet, error) { - set, err := mitigate.NewCPUSet(data, m.vulnerable) + set, err := mitigate.NewCPUSet(data) if err != nil { return nil, err } @@ -145,7 +144,7 @@ func (m *Mitigate) doMitigate(data []byte) (mitigate.CPUSet, error) { continue } if err := t.Disable(); err != nil { - return nil, fmt.Errorf("error disabling thread: %s err: %v", t, err) + return nil, fmt.Errorf("error disabling thread: %s err: %w", t, err) } } log.Infof("Shutdown successful.") @@ -170,7 +169,7 @@ func (m *Mitigate) doReverse(data []byte) (mitigate.CPUSet, error) { continue } if err := t.Enable(); err != nil { - return nil, fmt.Errorf("error enabling thread: %s err: %v", t, err) + return nil, fmt.Errorf("error enabling thread: %s err: %w", t, err) } } log.Infof("Enable successful.") diff --git a/runsc/cmd/mitigate_extras.go b/runsc/cmd/mitigate_extras.go new file mode 100644 index 000000000..2cb2833f0 --- /dev/null +++ b/runsc/cmd/mitigate_extras.go @@ -0,0 +1,33 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "gvisor.dev/gvisor/runsc/flag" + "gvisor.dev/gvisor/runsc/mitigate" +) + +// usage returns any extra bits of the usage string. +func (m *Mitigate) usage() string { + return "" +} + +// setFlags sets extra flags for the command Mitigate. +func (m *Mitigate) setFlags(f *flag.FlagSet) {} + +// postMitigate handles any postMitigate actions. 
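+// In this build the hook is a no-op and m.data stays empty, so Execute
+// returns right after the mitigate step without reaching postMitigate.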
+func (m *Mitigate) postMitigate(_ mitigate.CPUSet) error {
+ return nil
+}
diff --git a/runsc/cmd/mitigate_test.go b/runsc/cmd/mitigate_test.go
index 54211ce32..2d3fef7c1 100644
--- a/runsc/cmd/mitigate_test.go
+++ b/runsc/cmd/mitigate_test.go
@@ -23,7 +23,6 @@ import (
 "strings"
 "testing"
- "gvisor.dev/gvisor/runsc/mitigate"
 "gvisor.dev/gvisor/runsc/mitigate/mock"
)
@@ -86,9 +85,6 @@ power management::84
 t.Run(tc.name, func(t *testing.T) {
 m := &Mitigate{
 dryRun: true,
- vulnerable: func(other mitigate.Thread) bool {
- return other.IsVulnerable()
- },
 }
 m.doExecuteTest(t, "Mitigate", tc.mitigateData, tc.mitigateCPU, tc.mitigateError)
@@ -106,9 +102,6 @@ func TestExecuteSmoke(t *testing.T) {
 m := &Mitigate{
 dryRun: true,
- vulnerable: func(other mitigate.Thread) bool {
- return other.IsVulnerable()
- },
 }
 m.doExecuteTest(t, "Mitigate", string(smokeMitigate), 0, nil)
diff --git a/runsc/cmd/symbolize.go b/runsc/cmd/symbolize.go
index fc0c69358..0fa4bfda1 100644
--- a/runsc/cmd/symbolize.go
+++ b/runsc/cmd/symbolize.go
@@ -65,13 +65,15 @@ func (c *Symbolize) Execute(_ context.Context, f *flag.FlagSet, args ...interfac
 f.Usage()
 return subcommands.ExitUsageError
 }
- if !coverage.KcovAvailable() {
+ if !coverage.Available() {
 return Errorf("symbolize can only be used when coverage is available.")
 }
 coverage.InitCoverageData()
 if c.dumpAll {
- coverage.WriteAllBlocks(os.Stdout)
+ if err := coverage.WriteAllBlocks(os.Stdout); err != nil {
+ return Errorf("Failed to write out blocks: %v", err)
+ }
 return subcommands.ExitSuccess
 }
diff --git a/runsc/cmd/verity_prepare.go b/runsc/cmd/verity_prepare.go
new file mode 100644
index 000000000..66128b2a3
--- /dev/null
+++ b/runsc/cmd/verity_prepare.go
@@ -0,0 +1,108 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "fmt"
+ "math/rand"
+ "os"
+
+ "github.com/google/subcommands"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/runsc/config"
+ "gvisor.dev/gvisor/runsc/flag"
+ "gvisor.dev/gvisor/runsc/specutils"
+)
+
+// VerityPrepare implements subcommands.Command for the "verity-prepare"
+// command. It sets up a sandbox with a writable verity mount mapped to "--dir",
+// and executes the verity measure tool specified by "--tool" in the sandbox. It
+// is intended to prepare --dir to be mounted as a verity filesystem.
+type VerityPrepare struct {
+ root string
+ tool string
+ dir string
+}
+
+// Name implements subcommands.Command.Name.
+func (*VerityPrepare) Name() string {
+ return "verity-prepare"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*VerityPrepare) Synopsis() string {
+ return "Generates the data structures necessary to enable verityfs on a filesystem."
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*VerityPrepare) Usage() string {
+ return "verity-prepare --tool=<measure_tool> --dir=<path>"
+}
+
+// SetFlags implements subcommands.Command.SetFlags. 
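+//
+// A hypothetical invocation (both paths are examples only):
+//
+//   runsc verity-prepare --tool=/usr/bin/measure --dir=/var/lib/data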
+func (c *VerityPrepare) SetFlags(f *flag.FlagSet) {
+ f.StringVar(&c.root, "root", "/", `path to the root directory, defaults to "/"`)
+ f.StringVar(&c.tool, "tool", "", "path to the verity measure_tool")
+ f.StringVar(&c.dir, "dir", "", "path to the directory to be hashed")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (c *VerityPrepare) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ conf := args[0].(*config.Config)
+ waitStatus := args[1].(*unix.WaitStatus)
+
+ hostname, err := os.Hostname()
+ if err != nil {
+ return Errorf("Error retrieving hostname: %v", err)
+ }
+
+ // Map the entire host file system.
+ absRoot, err := resolvePath(c.root)
+ if err != nil {
+ return Errorf("Error resolving root: %v", err)
+ }
+
+ spec := &specs.Spec{
+ Root: &specs.Root{
+ Path: absRoot,
+ },
+ Process: &specs.Process{
+ Cwd: absRoot,
+ Args: []string{c.tool, "--path", "/verityroot"},
+ Env: os.Environ(),
+ Capabilities: specutils.AllCapabilities(),
+ },
+ Hostname: hostname,
+ Mounts: []specs.Mount{
+ specs.Mount{
+ Source: c.dir,
+ Destination: "/verityroot",
+ Type: "bind",
+ Options: []string{"verity.roothash="},
+ },
+ },
+ }
+
+ cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000))
+
+ // Force no networking; it is not necessary to run the verity measure tool.
+ conf.Network = config.NetworkNone
+
+ conf.Verity = true
+
+ return startContainerAndWait(spec, conf, cid, waitStatus)
+}
diff --git a/runsc/config/config.go b/runsc/config/config.go
index 1e5858837..fa550ebf7 100644
--- a/runsc/config/config.go
+++ b/runsc/config/config.go
@@ -55,6 +55,9 @@ type Config struct {
 // PanicLog is the path to log GO's runtime messages, if not empty.
 PanicLog string `flag:"panic-log"`
+ // CoverageReport is the path to write Go coverage information, if not empty.
+ CoverageReport string `flag:"coverage-report"`
+
 // DebugLogFormat is the log format for debug.
 DebugLogFormat string `flag:"debug-log-format"`
@@ -172,6 +175,9 @@ type Config struct {
 // Enables seccomp inside the sandbox.
 OCISeccomp bool `flag:"oci-seccomp"`
+ // Mounts the cgroup filesystem backed by the sentry's cgroupfs.
+ Cgroupfs bool `flag:"cgroupfs"`
+
 // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
 // tests. It allows runsc to start the sandbox process as the current
 // user, and without chrooting the sandbox process. This can be
diff --git a/runsc/config/flags.go b/runsc/config/flags.go
index 1d996c841..c3dca2352 100644
--- a/runsc/config/flags.go
+++ b/runsc/config/flags.go
@@ -44,7 +44,8 @@ func RegisterFlags() {
 // Debugging flags.
 flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
- flag.String("panic-log", "", "file path were panic reports and other Go's runtime messages are written.")
+ flag.String("panic-log", "", "file path where panic reports and other Go's runtime messages are written.")
+ flag.String("coverage-report", "", "file path where Go coverage reports are written. 
Reports will only be generated if runsc is built with --collect_code_coverage and --instrumentation_filter Bazel flags.") flag.Bool("log-packets", false, "enable network packet logging.") flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.") flag.Bool("alsologtostderr", false, "send log messages to stderr.") @@ -75,6 +76,7 @@ func RegisterFlags() { flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.") flag.Bool("vfs2", false, "enables VFSv2. This uses the new VFS layer that is faster than the previous one.") flag.Bool("fuse", false, "TEST ONLY; use while FUSE in VFSv2 is landing. This allows the use of the new experimental FUSE filesystem.") + flag.Bool("cgroupfs", false, "Automatically mount cgroupfs.") // Flags that control sandbox runtime behavior: network related. flag.Var(networkTypePtr(NetworkSandbox), "network", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 3620dc8c3..5314549d6 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -51,9 +51,7 @@ go_test( ], library = ":container", shard_count = more_shards, - tags = [ - "requires-kvm", - ], + tags = ["requires-kvm"], deps = [ "//pkg/abi/linux", "//pkg/bits", diff --git a/runsc/container/container.go b/runsc/container/container.go index f9d83c118..0820edaec 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -233,7 +233,7 @@ func New(conf *config.Config, args Args) (*Container, error) { } // Create and join cgroup before processes are created to ensure they are // part of the cgroup from the start (and all their children processes). - cg, err := cgroup.New(args.Spec) + cg, err := cgroup.NewFromSpec(args.Spec) if err != nil { return nil, err } @@ -886,7 +886,7 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bu // Add root mount and then add any other additional mounts. mountCount := 1 for _, m := range spec.Mounts { - if specutils.Is9PMount(m) { + if specutils.Is9PMount(m, conf.VFS2) { mountCount++ } } @@ -1132,7 +1132,7 @@ func (c *Container) populateStats(event *boot.EventOut) { // account for the full cgroup CPU usage. We split cgroup usage // proportionally according to the sentry-internal usage measurements, // only counting Running containers. - log.Warningf("event.ContainerUsage: %v", event.ContainerUsage) + log.Debugf("event.ContainerUsage: %v", event.ContainerUsage) var containerUsage uint64 var allContainersUsage uint64 for ID, usage := range event.ContainerUsage { @@ -1142,7 +1142,7 @@ func (c *Container) populateStats(event *boot.EventOut) { } } - cgroup, err := c.Sandbox.FindCgroup() + cgroup, err := c.Sandbox.NewCGroup() if err != nil { // No cgroup, so rely purely on the sentry's accounting. log.Warningf("events: no cgroups") @@ -1159,17 +1159,18 @@ func (c *Container) populateStats(event *boot.EventOut) { return } - // If the sentry reports no memory usage, fall back on cgroups and - // split usage equally across containers. + // If the sentry reports no CPU usage, fall back on cgroups and split usage + // equally across containers. 
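+ // For example (hypothetical numbers): with a cgroupsUsage of 30s and
+ // three containers in event.ContainerUsage, each is attributed 10s.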
if allContainersUsage == 0 {
 log.Warningf("events: no sentry CPU usage reported")
 allContainersUsage = cgroupsUsage
 containerUsage = cgroupsUsage / uint64(len(event.ContainerUsage))
 }
- log.Warningf("%f, %f, %f", containerUsage, cgroupsUsage, allContainersUsage)
 // Scaling can easily overflow a uint64 (e.g. a containerUsage and
 // cgroupsUsage of 16 seconds each will overflow), so use floats.
- event.Event.Data.CPU.Usage.Total = uint64(float64(containerUsage) * (float64(cgroupsUsage) / float64(allContainersUsage)))
+ total := float64(containerUsage) * (float64(cgroupsUsage) / float64(allContainersUsage))
+ log.Debugf("Usage, container: %d, cgroups: %d, all: %d, total: %.0f", containerUsage, cgroupsUsage, allContainersUsage, total)
+ event.Event.Data.CPU.Usage.Total = uint64(total)
 return
}
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 5a0c468a4..0e79877b7 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -2449,6 +2449,27 @@ func TestCreateWithCorruptedStateFile(t *testing.T) {
 }
}
+func TestBindMountByOption(t *testing.T) {
+ for name, conf := range configs(t, all...) {
+ t.Run(name, func(t *testing.T) {
+ dir, err := ioutil.TempDir(testutil.TmpDir(), "bind-mount")
+ if err != nil {
+ t.Fatalf("ioutil.TempDir(): %v", err)
+ }
+ spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
+ spec.Mounts = append(spec.Mounts, specs.Mount{
+ Destination: dir,
+ Source: dir,
+ Type: "none",
+ Options: []string{"rw", "bind"},
+ })
+ if err := run(spec, conf); err != nil {
+ t.Fatalf("error running sandbox: %v", err)
+ }
+ })
+ }
+}
+
 func execute(cont *Container, name string, arg ...string) (unix.WaitStatus, error) {
 args := &control.ExecArgs{
 Filename: name,
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 0f0a223ce..0dbe1e323 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -25,6 +25,7 @@ import (
 "testing"
 "time"
+ "github.com/cenkalti/backoff"
 specs "github.com/opencontainers/runtime-spec/specs-go"
 "golang.org/x/sys/unix"
 "gvisor.dev/gvisor/pkg/cleanup"
@@ -1510,7 +1511,7 @@ func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
 Destination: "/mydir/test",
 Source: "/some/dir",
 Type: "tmpfs",
- Options: []string{"rw", "rbind", "relatime"},
+ Options: []string{"rw", "relatime"},
 }
 podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
@@ -1917,9 +1918,9 @@ func TestMultiContainerEvent(t *testing.T) {
 }
 defer cleanup()
- for _, cont := range containers {
- t.Logf("Running containerd %s", cont.ID)
- }
+ t.Logf("Running container sleep %s", containers[0].ID)
+ t.Logf("Running container busy %s", containers[1].ID)
+ t.Logf("Running container quick %s", containers[2].ID)
 // Wait for last container to stabilize the process count that is
 // checked further below.
@@ -1940,50 +1941,61 @@ func TestMultiContainerEvent(t *testing.T) {
 }
 // Check events for running containers. 
- var prevUsage uint64 for _, cont := range containers[:2] { ret, err := cont.Event() if err != nil { - t.Errorf("Container.Events(): %v", err) + t.Errorf("Container.Event(%q): %v", cont.ID, err) } evt := ret.Event if want := "stats"; evt.Type != want { - t.Errorf("Wrong event type, want: %s, got: %s", want, evt.Type) + t.Errorf("Wrong event type, cid: %q, want: %s, got: %s", cont.ID, want, evt.Type) } if cont.ID != evt.ID { t.Errorf("Wrong container ID, want: %s, got: %s", cont.ID, evt.ID) } // One process per remaining container. if got, want := evt.Data.Pids.Current, uint64(2); got != want { - t.Errorf("Wrong number of PIDs, want: %d, got: %d", want, got) + t.Errorf("Wrong number of PIDs, cid: %q, want: %d, got: %d", cont.ID, want, got) } - // Both remaining containers should have nonzero usage, and - // 'busy' should have higher usage than 'sleep'. - usage := evt.Data.CPU.Usage.Total - if usage == 0 { - t.Errorf("Running container should report nonzero CPU usage, but got %d", usage) + // The exited container should always have a usage of zero. + if exited := ret.ContainerUsage[containers[2].ID]; exited != 0 { + t.Errorf("Exited container should report 0 CPU usage, got: %d", exited) + } + } + + // Check that CPU reported by busy container is higher than sleep. + cb := func() error { + sleepEvt, err := containers[0].Event() + if err != nil { + return &backoff.PermanentError{Err: err} } - if usage <= prevUsage { - t.Errorf("Expected container %s to use more than %d ns of CPU, but used %d", cont.ID, prevUsage, usage) + sleepUsage := sleepEvt.Event.Data.CPU.Usage.Total + + busyEvt, err := containers[1].Event() + if err != nil { + return &backoff.PermanentError{Err: err} } - t.Logf("Container %s usage: %d", cont.ID, usage) - prevUsage = usage + busyUsage := busyEvt.Event.Data.CPU.Usage.Total - // The exited container should have a usage of zero. - if exited := ret.ContainerUsage[containers[2].ID]; exited != 0 { - t.Errorf("Exited container should report 0 CPU usage, but got %d", exited) + if busyUsage <= sleepUsage { + t.Logf("Busy container usage lower than sleep (busy: %d, sleep: %d), retrying...", busyUsage, sleepUsage) + return fmt.Errorf("Busy container should have higher usage than sleep, busy: %d, sleep: %d", busyUsage, sleepUsage) } + return nil + } + // Give time for busy container to run and use more CPU than sleep. + if err := testutil.Poll(cb, 10*time.Second); err != nil { + t.Fatal(err) } - // Check that stop and destroyed containers return error. + // Check that stopped and destroyed containers return error. if err := containers[1].Destroy(); err != nil { t.Fatalf("container.Destroy: %v", err) } for _, cont := range containers[1:] { - _, err := cont.Event() - if err == nil { - t.Errorf("Container.Events() should have failed, cid:%s, state: %v", cont.ID, cont.Status) + if _, err := cont.Event(); err == nil { + t.Errorf("Container.Event() should have failed, cid: %q, state: %v", cont.ID, cont.Status) } } } diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index e04ddda47..b81ede5ae 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -21,6 +21,7 @@ package fsgofer import ( + "errors" "fmt" "io" "math" @@ -58,9 +59,6 @@ var verityXattrs = map[string]struct{}{ // join is equivalent to path.Join() but skips path.Clean() which is expensive. func join(parent, child string) string { - if child == "." || child == ".." 
{
- panic(fmt.Sprintf("invalid child path %q", child))
- }
 return parent + "/" + child
}
@@ -1226,3 +1224,56 @@ func (l *localFile) checkROMount() error {
 }
 return nil
}
+
+func (l *localFile) MultiGetAttr(names []string) ([]p9.FullStat, error) {
+ stats := make([]p9.FullStat, 0, len(names))
+
+ if len(names) > 0 && names[0] == "" {
+ qid, valid, attr, err := l.GetAttr(p9.AttrMask{})
+ if err != nil {
+ return nil, err
+ }
+ stats = append(stats, p9.FullStat{
+ QID: qid,
+ Valid: valid,
+ Attr: attr,
+ })
+ names = names[1:]
+ }
+
+ parent := l.file.FD()
+ for _, name := range names {
+ child, err := unix.Openat(parent, name, openFlags|unix.O_PATH, 0)
+ if parent != l.file.FD() {
+ // Parent is no longer needed.
+ _ = unix.Close(parent)
+ }
+ if err != nil {
+ if errors.Is(err, unix.ENOENT) {
+ // No point in continuing any further.
+ break
+ }
+ return nil, err
+ }
+
+ var stat unix.Stat_t
+ if err := unix.Fstat(child, &stat); err != nil {
+ _ = unix.Close(child)
+ return nil, err
+ }
+ valid, attr := l.fillAttr(&stat)
+ stats = append(stats, p9.FullStat{
+ QID: l.attachPoint.makeQID(&stat),
+ Valid: valid,
+ Attr: attr,
+ })
+ if (stat.Mode & unix.S_IFMT) != unix.S_IFDIR {
+ // No need to continue if the entry is not a directory, including
+ // symlinks that cannot be followed.
+ _ = unix.Close(child)
+ break
+ }
+ parent = child
+ }
+ return stats, nil
+}
diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go
index d7e141476..77723827a 100644
--- a/runsc/fsgofer/fsgofer_test.go
+++ b/runsc/fsgofer/fsgofer_test.go
@@ -703,16 +703,6 @@ func TestWalkNotFound(t *testing.T) {
 })
}
-func TestWalkPanic(t *testing.T) {
- runCustom(t, []uint32{unix.S_IFDIR}, allConfs, func(t *testing.T, s state) {
- for _, name := range []string{".", ".."} {
- assertPanic(t, func() {
- s.file.Walk([]string{name})
- })
- }
- })
-}
-
func TestWalkDup(t *testing.T) {
 runAll(t, func(t *testing.T, s state) {
 _, dup, err := s.file.Walk([]string{})
diff --git a/runsc/mitigate/mitigate.go b/runsc/mitigate/mitigate.go
index 24f67414c..88409af8f 100644
--- a/runsc/mitigate/mitigate.go
+++ b/runsc/mitigate/mitigate.go
@@ -50,7 +50,7 @@ const (
type CPUSet map[threadID]*ThreadGroup
// NewCPUSet creates a CPUSet from data read from /proc/cpuinfo. 
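// Threads that share a physical core are grouped into a single ThreadGroup,
// and a group is marked vulnerable as soon as any one of its threads is
// (see the isVulnerable update below).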
-func NewCPUSet(data []byte, vulnerable func(Thread) bool) (CPUSet, error) {
+func NewCPUSet(data []byte) (CPUSet, error) {
 processors, err := getThreads(string(data))
 if err != nil {
 return nil, err
@@ -67,7 +67,7 @@ func NewCPUSet(data []byte, vulnerable func(Thread) bool) (CPUSet, error) {
 core = &ThreadGroup{}
 set[p.id] = core
 }
- core.isVulnerable = core.isVulnerable || vulnerable(p)
+ core.isVulnerable = core.isVulnerable || p.IsVulnerable()
 core.threads = append(core.threads, p)
 }
@@ -446,6 +446,7 @@ func buildRegex(key, match string) *regexp.Regexp {
func parseRegex(data, key, match string) (string, error) {
 r := buildRegex(key, match)
 matches := r.FindStringSubmatch(data)
+
 if len(matches) < 2 {
 return "", fmt.Errorf("failed to match key %q: %q", key, data)
 }
diff --git a/runsc/mitigate/mitigate_test.go b/runsc/mitigate/mitigate_test.go
index bd5a2433f..890c65f05 100644
--- a/runsc/mitigate/mitigate_test.go
+++ b/runsc/mitigate/mitigate_test.go
@@ -54,14 +54,13 @@ func TestMockCPUSet(t *testing.T) {
 } {
 t.Run(tc.testCase.Name, func(t *testing.T) {
 data := tc.testCase.MakeCPUString()
- vulnerable := func(t Thread) bool {
- return t.IsVulnerable()
- }
- set, err := NewCPUSet([]byte(data), vulnerable)
+ set, err := NewCPUSet([]byte(data))
 if err != nil {
 t.Fatalf("Failed to create cpuSet: %v", err)
 }
+ t.Logf("data: %s", data)
+
 for _, tg := range set {
 if err := checkSorted(tg.threads); err != nil {
 t.Fatalf("Failed to sort cpuSet: %v", err)
@@ -260,11 +259,7 @@ func TestReadFile(t *testing.T) {
 t.Fatalf("Failed to read cpuinfo: %v", err)
 }
- vulnerable := func(t Thread) bool {
- return t.IsVulnerable()
- }
-
- set, err := NewCPUSet(data, vulnerable)
+ set, err := NewCPUSet(data)
 if err != nil {
 t.Fatalf("Failed to parse CPU data %v\n%s", err, data)
 }
diff --git a/runsc/mitigate/mock/mock.go b/runsc/mitigate/mock/mock.go
index 2db718cb9..12c59e356 100644
--- a/runsc/mitigate/mock/mock.go
+++ b/runsc/mitigate/mock/mock.go
@@ -82,6 +82,19 @@ var Haswell2core = CPU{
 ThreadsPerCore: 1,
}
+// AMD2 is a two core AMD machine.
+var AMD2 = CPU{
+ Name: "AMD",
+ VendorID: "AuthenticAMD",
+ Family: 23,
+ Model: 49,
+ ModelName: "AMD EPYC 7B12",
+ Bugs: "sysret_ss_attrs spectre_v1 spectre_v2 spec_store_bypass",
+ PhysicalCores: 1,
+ Cores: 1,
+ ThreadsPerCore: 2,
+}
+
// AMD8 is an eight core AMD machine. 
var AMD8 = CPU{ Name: "AMD", @@ -115,15 +128,15 @@ bugs : %s for k := 0; k < tc.ThreadsPerCore; k++ { processorNum := (i*tc.Cores+j)*tc.ThreadsPerCore + k ret += fmt.Sprintf(template, - processorNum, /*processor*/ - tc.VendorID, /*vendor_id*/ - tc.Family, /*cpu family*/ - tc.Model, /*model*/ - tc.ModelName, /*model name*/ - i, /*physical id*/ - j, /*core id*/ - tc.Cores*tc.PhysicalCores, /*cpu cores*/ - tc.Bugs, /*bugs*/ + processorNum, /*processor*/ + tc.VendorID, /*vendor_id*/ + tc.Family, /*cpu family*/ + tc.Model, /*model*/ + tc.ModelName, /*model name*/ + i, /*physical id*/ + j, /*core id*/ + k, /*cpu cores*/ + tc.Bugs, /*bugs*/ ) } } diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index f0a551a1e..bc4a3fa32 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -16,6 +16,7 @@ go_library( "//pkg/cleanup", "//pkg/control/client", "//pkg/control/server", + "//pkg/coverage", "//pkg/log", "//pkg/sentry/control", "//pkg/sentry/platform", diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 450f92645..8d31e33b2 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -34,6 +34,7 @@ import ( "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/control/client" "gvisor.dev/gvisor/pkg/control/server" + "gvisor.dev/gvisor/pkg/coverage" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/platform" @@ -309,20 +310,9 @@ func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { return pl, nil } -// FindCgroup returns the sandbox's Cgroup, or an error if it does not have one. -func (s *Sandbox) FindCgroup() (*cgroup.Cgroup, error) { - paths, err := cgroup.LoadPaths(strconv.Itoa(s.Pid)) - if err != nil { - return nil, err - } - // runsc places sandboxes in the same cgroup for each controller, so we - // pick an arbitrary controller here to get the cgroup path. - const controller = "cpuacct" - controllerPath, ok := paths[controller] - if !ok { - return nil, fmt.Errorf("no %q controller found", controller) - } - return cgroup.NewFromPath(controllerPath) +// NewCGroup returns the sandbox's Cgroup, or an error if it does not have one. +func (s *Sandbox) NewCGroup() (*cgroup.Cgroup, error) { + return cgroup.NewFromPid(s.Pid) } // Execute runs the specified command in the container. It returns the PID of @@ -399,15 +389,15 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn cmd.Args = append(cmd.Args, "--log-fd="+strconv.Itoa(nextFD)) nextFD++ } - if conf.DebugLog != "" { - test := "" - if len(conf.TestOnlyTestNameEnv) != 0 { - // Fetch test name if one is provided and the test only flag was set. - if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok { - test = t - } - } + test := "" + if len(conf.TestOnlyTestNameEnv) != 0 { + // Fetch test name if one is provided and the test only flag was set. + if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok { + test = t + } + } + if conf.DebugLog != "" { debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "boot", test) if err != nil { return fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err) @@ -418,23 +408,29 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn nextFD++ } if conf.PanicLog != "" { - test := "" - if len(conf.TestOnlyTestNameEnv) != 0 { - // Fetch test name if one is provided and the test only flag was set. 
- if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
- test = t
- }
- }
-
 panicLogFile, err := specutils.DebugLogFile(conf.PanicLog, "panic", test)
 if err != nil {
- return fmt.Errorf("opening debug log file in %q: %v", conf.PanicLog, err)
+ return fmt.Errorf("opening panic log file in %q: %v", conf.PanicLog, err)
 }
 defer panicLogFile.Close()
 cmd.ExtraFiles = append(cmd.ExtraFiles, panicLogFile)
 cmd.Args = append(cmd.Args, "--panic-log-fd="+strconv.Itoa(nextFD))
 nextFD++
 }
+ covFilename := conf.CoverageReport
+ if covFilename == "" {
+ covFilename = os.Getenv("GO_COVERAGE_FILE")
+ }
+ if covFilename != "" && coverage.Available() {
+ covFile, err := specutils.DebugLogFile(covFilename, "cov", test)
+ if err != nil {
+ return fmt.Errorf("opening coverage report file in %q: %v", covFilename, err)
+ }
+ defer covFile.Close()
+ cmd.ExtraFiles = append(cmd.ExtraFiles, covFile)
+ cmd.Args = append(cmd.Args, "--coverage-fd="+strconv.Itoa(nextFD))
+ nextFD++
+ }
 // Add the "boot" command to the args.
 //
@@ -486,7 +482,7 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn
 }
 if deviceFile, err := gPlatform.OpenDevice(); err != nil {
- return fmt.Errorf("opening device file for platform %q: %v", gPlatform, err)
+ return fmt.Errorf("opening device file for platform %q: %v", conf.Platform, err)
 } else if deviceFile != nil {
 defer deviceFile.Close()
 cmd.ExtraFiles = append(cmd.ExtraFiles, deviceFile)
@@ -1174,7 +1170,7 @@ func deviceFileForPlatform(name string) (*os.File, error) {
 f, err := p.OpenDevice()
 if err != nil {
- return nil, fmt.Errorf("opening device file for platform %q: %v", p, err)
+ return nil, fmt.Errorf("opening device file for platform %q: %w", name, err)
 }
 return f, nil
}
diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go
index b62504a8c..9ecd0fde6 100644
--- a/runsc/specutils/fs.go
+++ b/runsc/specutils/fs.go
@@ -18,6 +18,7 @@ import (
 "fmt"
 "math/bits"
 "path"
+ "strings"
 specs "github.com/opencontainers/runtime-spec/specs-go"
 "golang.org/x/sys/unix"
@@ -64,6 +65,12 @@ var optionsMap = map[string]mapping{
 "sync": {set: true, val: unix.MS_SYNCHRONOUS},
}
+// verityMountOptions is the set of valid verity mount option keys.
+var verityMountOptions = map[string]struct{}{
+ "verity.roothash": struct{}{},
+ "verity.action": struct{}{},
+}
+
// propOptionsMap is similar to optionsMap, but it lists propagation options
// that cannot be used together with other flags.
var propOptionsMap = map[string]mapping{
@@ -117,6 +124,14 @@ func validateMount(mnt *specs.Mount) error {
 return nil
}
+func moptKey(opt string) string {
+ if len(opt) == 0 {
+ return opt
+ }
+ // Guaranteed to have at least one token, since opt is not empty.
+ return strings.SplitN(opt, "=", 2)[0]
+}
+
// ValidateMountOptions validates that mount options are correct. 
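// Key=value options such as "verity.roothash=<hash>" are reduced to their
// key ("verity.roothash") by moptKey before the verityMountOptions lookup
// below.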
func ValidateMountOptions(opts []string) error { for _, o := range opts { @@ -125,7 +140,8 @@ func ValidateMountOptions(opts []string) error { } _, ok1 := optionsMap[o] _, ok2 := propOptionsMap[o] - if !ok1 && !ok2 { + _, ok3 := verityMountOptions[moptKey(o)] + if !ok1 && !ok2 && !ok3 { return fmt.Errorf("unknown mount option %q", o) } if err := validatePropagation(o); err != nil { diff --git a/runsc/specutils/seccomp/BUILD b/runsc/specutils/seccomp/BUILD index e9e647d82..c5f5b863e 100644 --- a/runsc/specutils/seccomp/BUILD +++ b/runsc/specutils/seccomp/BUILD @@ -28,8 +28,10 @@ go_test( srcs = ["seccomp_test.go"], library = ":seccomp", deps = [ - "//pkg/binary", + "//pkg/abi/linux", "//pkg/bpf", + "//pkg/hostarch", + "//pkg/marshal", "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/runsc/specutils/seccomp/seccomp_test.go b/runsc/specutils/seccomp/seccomp_test.go index 11a6c8daa..20796bf14 100644 --- a/runsc/specutils/seccomp/seccomp_test.go +++ b/runsc/specutils/seccomp/seccomp_test.go @@ -20,20 +20,15 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/marshal" ) -type seccompData struct { - nr uint32 - arch uint32 - instructionPointer uint64 - args [6]uint64 -} - -// asInput converts a seccompData to a bpf.Input. -func asInput(d seccompData) bpf.Input { - return bpf.InputBytes{binary.Marshal(nil, binary.LittleEndian, d), binary.LittleEndian} +// asInput converts a linux.SeccompData to a bpf.Input. +func asInput(d *linux.SeccompData) bpf.Input { + return bpf.InputBytes{marshal.Marshal(d), hostarch.ByteOrder} } // testInput creates an Input struct with given seccomp input values. @@ -49,13 +44,13 @@ func testInput(arch uint32, syscallName string, args *[6]uint64) bpf.Input { args = &argArray } - data := seccompData{ - nr: syscallNo, - arch: arch, - args: *args, + data := linux.SeccompData{ + Nr: int32(syscallNo), + Arch: arch, + Args: *args, } - return asInput(data) + return asInput(&data) } // testCase holds a seccomp test case. @@ -100,7 +95,7 @@ var ( }, // Syscall matches but the arch is AUDIT_ARCH_X86 so the return // value is the bad arch action. - input: asInput(seccompData{nr: 183, arch: 0x40000003}), // + input: asInput(&linux.SeccompData{Nr: 183, Arch: 0x40000003}), // expected: uint32(killThreadAction), }, { diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 45856fd58..11b476690 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -332,14 +332,38 @@ func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth. return auth.CapabilitySetOfMany(caps), nil } -// Is9PMount returns true if the given mount can be mounted as an external gofer. -func Is9PMount(m specs.Mount) bool { - return m.Type == "bind" && m.Source != "" && IsVFS1SupportedDevMount(m) +// Is9PMount returns true if the given mount can be mounted as an external +// gofer. +func Is9PMount(m specs.Mount, vfs2Enabled bool) bool { + MaybeConvertToBindMount(&m) + return m.Type == "bind" && m.Source != "" && IsSupportedDevMount(m, vfs2Enabled) } -// IsVFS1SupportedDevMount returns true if m.Destination does not specify a +// MaybeConvertToBindMount converts mount type to "bind" in case any of the +// mount options are either "bind" or "rbind" as required by the OCI spec. 
+//
+// "For bind mounts (when options include either bind or rbind), the type is a
+// dummy, often "none" (not listed in /proc/filesystems)."
+func MaybeConvertToBindMount(m *specs.Mount) {
+ if m.Type == "bind" {
+ return
+ }
+ for _, opt := range m.Options {
+ if opt == "bind" || opt == "rbind" {
+ m.Type = "bind"
+ return
+ }
+ }
+}
+
+// IsSupportedDevMount returns true if m.Destination does not specify a
 // path that is hardcoded by VFS1's implementation of /dev.
-func IsVFS1SupportedDevMount(m specs.Mount) bool {
+func IsSupportedDevMount(m specs.Mount, vfs2Enabled bool) bool {
+ // VFS2 has no hardcoded files under /dev, so everything is allowed.
+ if vfs2Enabled {
+ return true
+ }
+
 // See pkg/sentry/fs/dev/dev.go.
 var existingDevices = []string{
 "/dev/fd", "/dev/stdin", "/dev/stdout", "/dev/stderr",