diff options
Diffstat (limited to 'runsc')
-rw-r--r-- | runsc/boot/BUILD | 2 | ||||
-rw-r--r-- | runsc/boot/filter/config.go | 21 | ||||
-rw-r--r-- | runsc/boot/loader.go | 8 | ||||
-rw-r--r-- | runsc/boot/vfs.go | 2 | ||||
-rw-r--r-- | runsc/cgroup/BUILD | 1 | ||||
-rw-r--r-- | runsc/cgroup/cgroup.go | 99 | ||||
-rw-r--r-- | runsc/cgroup/cgroup_test.go | 36 | ||||
-rw-r--r-- | runsc/container/BUILD | 2 | ||||
-rw-r--r-- | runsc/container/container.go | 136 |
9 files changed, 237 insertions, 70 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index ff7a5a44b..36806b740 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -80,7 +80,6 @@ go_library( "//pkg/sentry/loader", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", - "//pkg/sentry/sighandling", "//pkg/sentry/socket/hostinet", "//pkg/sentry/socket/netfilter", "//pkg/sentry/socket/netlink", @@ -96,6 +95,7 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/vfs", "//pkg/sentry/watchdog", + "//pkg/sighandling", "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/link/ethernet", diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 703f34827..db363435b 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -304,6 +304,22 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.EqualTo(unix.SPLICE_F_NONBLOCK), /* flags */ }, }, + unix.SYS_TIMER_CREATE: []seccomp.Rule{ + { + seccomp.EqualTo(unix.CLOCK_THREAD_CPUTIME_ID), /* which */ + seccomp.MatchAny{}, /* sevp */ + seccomp.MatchAny{}, /* timerid */ + }, + }, + unix.SYS_TIMER_DELETE: []seccomp.Rule{}, + unix.SYS_TIMER_SETTIME: []seccomp.Rule{ + { + seccomp.MatchAny{}, /* timerid */ + seccomp.EqualTo(0), /* flags */ + seccomp.MatchAny{}, /* new_value */ + seccomp.EqualTo(0), /* old_value */ + }, + }, unix.SYS_TGKILL: []seccomp.Rule{ { seccomp.EqualTo(uint64(os.Getpid())), @@ -630,6 +646,11 @@ func hostInetFilters() seccomp.SyscallRules { func controlServerFilters(fd int) seccomp.SyscallRules { return seccomp.SyscallRules{ + unix.SYS_ACCEPT4: []seccomp.Rule{ + { + seccomp.EqualTo(fd), + }, + }, unix.SYS_ACCEPT: []seccomp.Rule{ { seccomp.EqualTo(fd), diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index b46d84e5a..2f2d4df5e 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -49,13 +49,13 @@ import ( "gvisor.dev/gvisor/pkg/sentry/loader" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/sighandling" 
"gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2" "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/watchdog" + "gvisor.dev/gvisor/pkg/sighandling" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/link/ethernet" @@ -241,10 +241,8 @@ func New(args Args) (*Loader, error) { // Is this a VFSv2 kernel? if args.Conf.VFS2 { kernel.VFS2Enabled = true - if args.Conf.FUSE { - kernel.FUSEEnabled = true - } - + kernel.FUSEEnabled = args.Conf.FUSE + kernel.LISAFSEnabled = args.Conf.Lisafs vfs2.Override() } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 2f1332566..ac1e5ac37 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -173,7 +173,7 @@ func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.Create rootProcArgs.Credentials = rootCreds rootProcArgs.Umask = 0022 rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals - rootCtx := procArgs.NewContext(c.k) + rootCtx := rootProcArgs.NewContext(c.k) mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds) if err != nil { diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD index f7e892584..d3aec1fff 100644 --- a/runsc/cgroup/BUILD +++ b/runsc/cgroup/BUILD @@ -9,6 +9,7 @@ go_library( deps = [ "//pkg/cleanup", "//pkg/log", + "//pkg/sync", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index 5dbf14376..7a0f0694f 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -19,7 +19,6 @@ package cgroup import ( "bufio" "context" - "errors" "fmt" "io" "io/ioutil" @@ -34,6 +33,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" ) const ( @@ -104,17 
+104,21 @@ func setOptionalValueUint16(path, name string, val *uint16) error { func setValue(path, name, data string) error { fullpath := filepath.Join(path, name) + log.Debugf("Setting %q to %q", fullpath, data) + return writeFile(fullpath, []byte(data), 0700) +} - // Retry writes on EINTR; see: - // https://github.com/golang/go/issues/38033 - for { - err := ioutil.WriteFile(fullpath, []byte(data), 0700) - if err == nil { - return nil - } else if !errors.Is(err, unix.EINTR) { - return err - } +// writeFile is similar to ioutil.WriteFile() but doesn't create the file if it +// doesn't exist. +func writeFile(path string, data []byte, perm os.FileMode) error { + f, err := os.OpenFile(path, os.O_WRONLY|os.O_TRUNC, perm) + if err != nil { + return err } + defer f.Close() + + _, err = f.Write(data) + return err } func getValue(path, name string) (string, error) { @@ -155,15 +159,8 @@ func fillFromAncestor(path string) (string, error) { return "", err } - // Retry writes on EINTR; see: - // https://github.com/golang/go/issues/38033 - for { - err := ioutil.WriteFile(path, []byte(val), 0700) - if err == nil { - break - } else if !errors.Is(err, unix.EINTR) { - return "", err - } + if err := writeFile(path, []byte(val), 0700); err != nil { + return "", nil } return val, nil } @@ -309,7 +306,13 @@ func NewFromSpec(spec *specs.Spec) (*Cgroup, error) { if spec.Linux == nil || spec.Linux.CgroupsPath == "" { return nil, nil } - return new("self", spec.Linux.CgroupsPath) + return NewFromPath(spec.Linux.CgroupsPath) +} + +// NewFromPath creates a new Cgroup instance from the specified relative path. +// Cgroup paths are loaded based on the current process. +func NewFromPath(cgroupsPath string) (*Cgroup, error) { + return new("self", cgroupsPath) } // NewFromPid loads cgroup for the given process. 
@@ -365,21 +368,20 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error { } for _, key := range missing { ctrlr := controllers[key] - path := c.MakePath(key) - log.Debugf("Creating cgroup %q: %q", key, path) - if err := os.MkdirAll(path, 0755); err != nil { - if ctrlr.optional() && errors.Is(err, unix.EROFS) { - if err := ctrlr.skip(res); err != nil { - return err - } - log.Infof("Skipping cgroup %q", key) - continue + + if skip, err := c.createController(key); skip && ctrlr.optional() { + if err := ctrlr.skip(res); err != nil { + return err } + log.Infof("Skipping cgroup %q, err: %v", key, err) + continue + } else if err != nil { return err } // Only set controllers that were created by me. c.Own[key] = true + path := c.MakePath(key) if err := ctrlr.set(res, path); err != nil { return err } @@ -388,10 +390,29 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error { return nil } +// createController creates the controller directory, checking that the +// controller is enabled in the system. It returns a boolean indicating whether +// the controller should be skipped (e.g. controller is disabled). In case it +// should be skipped, it also returns the error it got. +func (c *Cgroup) createController(name string) (bool, error) { + ctrlrPath := filepath.Join(cgroupRoot, name) + if _, err := os.Stat(ctrlrPath); err != nil { + return os.IsNotExist(err), err + } + + path := c.MakePath(name) + log.Debugf("Creating cgroup %q: %q", name, path) + if err := os.MkdirAll(path, 0755); err != nil { + return false, err + } + return false, nil +} + // Uninstall removes the settings done in Install(). If cgroup path already // existed when Install() was called, Uninstall is a noop. func (c *Cgroup) Uninstall() error { log.Debugf("Deleting cgroup %q", c.Name) + wait := sync.WaitGroupErr{} for key := range controllers { if !c.Own[key] { // cgroup is managed by caller, don't touch it. 
@@ -400,9 +421,8 @@ func (c *Cgroup) Uninstall() error { path := c.MakePath(key) log.Debugf("Removing cgroup controller for key=%q path=%q", key, path) - // If we try to remove the cgroup too soon after killing the - // sandbox we might get EBUSY, so we retry for a few seconds - // until it succeeds. + // If we try to remove the cgroup too soon after killing the sandbox we + // might get EBUSY, so we retry for a few seconds until it succeeds. ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) @@ -413,11 +433,18 @@ func (c *Cgroup) Uninstall() error { } return err } - if err := backoff.Retry(fn, b); err != nil { - return fmt.Errorf("removing cgroup path %q: %w", path, err) - } + // Run deletions in parallel to remove all directories even if there are + // failures/timeouts in other directories. + wait.Add(1) + go func() { + defer wait.Done() + if err := backoff.Retry(fn, b); err != nil { + wait.ReportError(fmt.Errorf("removing cgroup path %q: %w", path, err)) + return + } + }() } - return nil + return wait.Error() } // Join adds the current process to the all controllers. 
Returns function that diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go index 1431b4e8f..0b6a5431b 100644 --- a/runsc/cgroup/cgroup_test.go +++ b/runsc/cgroup/cgroup_test.go @@ -129,6 +129,18 @@ func boolPtr(v bool) *bool { return &v } +func createDir(dir string, contents map[string]string) error { + for name := range contents { + path := filepath.Join(dir, name) + f, err := os.Create(path) + if err != nil { + return err + } + f.Close() + } + return nil +} + func checkDir(t *testing.T, dir string, contents map[string]string) { all, err := ioutil.ReadDir(dir) if err != nil { @@ -254,6 +266,9 @@ func TestBlockIO(t *testing.T) { t.Fatalf("error creating temporary directory: %v", err) } defer os.RemoveAll(dir) + if err := createDir(dir, tc.wants); err != nil { + t.Fatalf("createDir(): %v", err) + } spec := &specs.LinuxResources{ BlockIO: tc.spec, @@ -304,6 +319,9 @@ func TestCPU(t *testing.T) { t.Fatalf("error creating temporary directory: %v", err) } defer os.RemoveAll(dir) + if err := createDir(dir, tc.wants); err != nil { + t.Fatalf("createDir(): %v", err) + } spec := &specs.LinuxResources{ CPU: tc.spec, @@ -343,6 +361,9 @@ func TestCPUSet(t *testing.T) { t.Fatalf("error creating temporary directory: %v", err) } defer os.RemoveAll(dir) + if err := createDir(dir, tc.wants); err != nil { + t.Fatalf("createDir(): %v", err) + } spec := &specs.LinuxResources{ CPU: tc.spec, @@ -481,6 +502,9 @@ func TestHugeTlb(t *testing.T) { t.Fatalf("error creating temporary directory: %v", err) } defer os.RemoveAll(dir) + if err := createDir(dir, tc.wants); err != nil { + t.Fatalf("createDir(): %v", err) + } spec := &specs.LinuxResources{ HugepageLimits: tc.spec, @@ -542,6 +566,9 @@ func TestMemory(t *testing.T) { t.Fatalf("error creating temporary directory: %v", err) } defer os.RemoveAll(dir) + if err := createDir(dir, tc.wants); err != nil { + t.Fatalf("createDir(): %v", err) + } spec := &specs.LinuxResources{ Memory: tc.spec, @@ -584,6 +611,9 @@ func 
TestNetworkClass(t *testing.T) { t.Fatalf("error creating temporary directory: %v", err) } defer os.RemoveAll(dir) + if err := createDir(dir, tc.wants); err != nil { + t.Fatalf("createDir(): %v", err) + } spec := &specs.LinuxResources{ Network: tc.spec, @@ -631,6 +661,9 @@ func TestNetworkPriority(t *testing.T) { t.Fatalf("error creating temporary directory: %v", err) } defer os.RemoveAll(dir) + if err := createDir(dir, tc.wants); err != nil { + t.Fatalf("createDir(): %v", err) + } spec := &specs.LinuxResources{ Network: tc.spec, @@ -671,6 +704,9 @@ func TestPids(t *testing.T) { t.Fatalf("error creating temporary directory: %v", err) } defer os.RemoveAll(dir) + if err := createDir(dir, tc.wants); err != nil { + t.Fatalf("createDir(): %v", err) + } spec := &specs.LinuxResources{ Pids: tc.spec, diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 5314549d6..4e744e604 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -19,7 +19,7 @@ go_library( "//pkg/cleanup", "//pkg/log", "//pkg/sentry/control", - "//pkg/sentry/sighandling", + "//pkg/sighandling", "//pkg/sync", "//runsc/boot", "//runsc/cgroup", diff --git a/runsc/container/container.go b/runsc/container/container.go index 9c0004753..6a59df411 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -35,7 +35,7 @@ import ( "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" - "gvisor.dev/gvisor/pkg/sentry/sighandling" + "gvisor.dev/gvisor/pkg/sighandling" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/cgroup" "gvisor.dev/gvisor/runsc/config" @@ -44,6 +44,8 @@ import ( "gvisor.dev/gvisor/runsc/specutils" ) +const cgroupParentAnnotation = "dev.gvisor.spec.cgroup-parent" + // validateID validates the container id. func validateID(id string) error { // See libcontainer/factory_linux.go. @@ -113,6 +115,16 @@ type Container struct { // container is created and reset when the sandbox is destroyed. 
Sandbox *sandbox.Sandbox `json:"sandbox"` + // CompatCgroup has the cgroup configuration for the container. For the single + // container case, container cgroup is set in `c.Sandbox` only. CompatCgroup + // is only set for multi-container, where the `c.Sandbox` cgroup represents + // the entire pod. + // + // Note that CompatCgroup is created only for compatibility with tools + // that expect container cgroups to exist. Setting limits here makes no change + // to the container in question. + CompatCgroup *cgroup.Cgroup `json:"compatCgroup"` + // Saver handles load from/save to the state file safely from multiple // processes. Saver StateFile `json:"saver"` @@ -233,27 +245,12 @@ func New(conf *config.Config, args Args) (*Container, error) { } // Create and join cgroup before processes are created to ensure they are // part of the cgroup from the start (and all their children processes). - cg, err := cgroup.NewFromSpec(args.Spec) + parentCgroup, subCgroup, err := c.setupCgroupForRoot(conf, args.Spec) if err != nil { return nil, err } - if cg != nil { - // TODO(gvisor.dev/issue/3481): Remove when cgroups v2 is supported. - if !conf.Rootless && cgroup.IsOnlyV2() { - return nil, fmt.Errorf("cgroups V2 is not yet supported. Enable cgroups V1 and retry") - } - // If there is cgroup config, install it before creating sandbox process. 
- if err := cg.Install(args.Spec.Linux.Resources); err != nil { - switch { - case errors.Is(err, unix.EACCES) && conf.Rootless: - log.Warningf("Skipping cgroup configuration in rootless mode: %v", err) - cg = nil - default: - return nil, fmt.Errorf("configuring cgroup: %v", err) - } - } - } - if err := runInCgroup(cg, func() error { + c.CompatCgroup = subCgroup + if err := runInCgroup(parentCgroup, func() error { ioFiles, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir, args.Attached) if err != nil { return err @@ -269,7 +266,7 @@ func New(conf *config.Config, args Args) (*Container, error) { UserLog: args.UserLog, IOFiles: ioFiles, MountsFile: specFile, - Cgroup: cg, + Cgroup: parentCgroup, Attached: args.Attached, } sand, err := sandbox.New(conf, sandArgs) @@ -296,6 +293,12 @@ func New(conf *config.Config, args Args) (*Container, error) { } c.Sandbox = sb.Sandbox + subCgroup, err := c.setupCgroupForSubcontainer(conf, args.Spec) + if err != nil { + return nil, err + } + c.CompatCgroup = subCgroup + // If the console control socket file is provided, then create a new // pty master/slave pair and send the TTY to the sandbox process. var tty *os.File @@ -781,16 +784,16 @@ func (c *Container) saveLocked() error { // root containers), and waits for the container or sandbox and the gofer // to stop. If any of them doesn't stop before timeout, an error is returned. func (c *Container) stop() error { - var cgroup *cgroup.Cgroup + var parentCgroup *cgroup.Cgroup if c.Sandbox != nil { log.Debugf("Destroying container, cid: %s", c.ID) if err := c.Sandbox.DestroyContainer(c.ID); err != nil { return fmt.Errorf("destroying container %q: %v", c.ID, err) } - // Only uninstall cgroup for sandbox stop. + // Only uninstall parentCgroup for sandbox stop. if c.Sandbox.IsRootContainer(c.ID) { - cgroup = c.Sandbox.Cgroup + parentCgroup = c.Sandbox.Cgroup } // Only set sandbox to nil after it has been told to destroy the container. 
c.Sandbox = nil @@ -809,9 +812,16 @@ func (c *Container) stop() error { return err } - // Gofer is running in cgroups, so Cgroup.Uninstall has to be called after it. - if cgroup != nil { - if err := cgroup.Uninstall(); err != nil { + // Delete container cgroup if any. + if c.CompatCgroup != nil { + if err := c.CompatCgroup.Uninstall(); err != nil { + return err + } + } + // Gofer is running inside parentCgroup, so Cgroup.Uninstall has to be called + // after the gofer has stopped. + if parentCgroup != nil { + if err := parentCgroup.Uninstall(); err != nil { return err } } @@ -1208,3 +1218,77 @@ func (c *Container) populateStats(event *boot.EventOut) { event.Event.Data.CPU.Usage.Total = uint64(total) return } + +// setupCgroupForRoot configures and returns cgroup for the sandbox and the +// root container. If `cgroupParentAnnotation` is set, use that path as the +// sandbox cgroup and use Spec.Linux.CgroupsPath as the root container cgroup. +func (c *Container) setupCgroupForRoot(conf *config.Config, spec *specs.Spec) (*cgroup.Cgroup, *cgroup.Cgroup, error) { + var parentCgroup *cgroup.Cgroup + if parentPath, ok := spec.Annotations[cgroupParentAnnotation]; ok { + var err error + parentCgroup, err = cgroup.NewFromPath(parentPath) + if err != nil { + return nil, nil, err + } + } else { + var err error + parentCgroup, err = cgroup.NewFromSpec(spec) + if parentCgroup == nil || err != nil { + return nil, nil, err + } + } + + var err error + parentCgroup, err = cgroupInstall(conf, parentCgroup, spec.Linux.Resources) + if parentCgroup == nil || err != nil { + return nil, nil, err + } + + subCgroup, err := c.setupCgroupForSubcontainer(conf, spec) + if err != nil { + _ = parentCgroup.Uninstall() + return nil, nil, err + } + return parentCgroup, subCgroup, nil +} + +// setupCgroupForSubcontainer sets up empty cgroups for subcontainers. Since +// subcontainers run exclusively inside the sandbox, subcontainer cgroups on the +// host have no effect on them. 
However, some tools (e.g. cAdvisor) use cgroups +paths to discover new containers and report stats for them. +func (c *Container) setupCgroupForSubcontainer(conf *config.Config, spec *specs.Spec) (*cgroup.Cgroup, error) { + if isRoot(spec) { + if _, ok := spec.Annotations[cgroupParentAnnotation]; !ok { + return nil, nil + } + } + + cg, err := cgroup.NewFromSpec(spec) + if cg == nil || err != nil { + return nil, err + } + // Use empty resources, just want the directory structure created. + return cgroupInstall(conf, cg, &specs.LinuxResources{}) +} + +// cgroupInstall creates cgroups dir structure and sets their respective +// resources. In case of success, returns the cgroups instance and nil error. +// For rootless, it's possible that cgroups operations fail, in this case the +// error is suppressed and a nil cgroups instance is returned to indicate that +// no cgroups was configured. +func cgroupInstall(conf *config.Config, cg *cgroup.Cgroup, res *specs.LinuxResources) (*cgroup.Cgroup, error) { + // TODO(gvisor.dev/issue/3481): Remove when cgroups v2 is supported. + if !conf.Rootless && cgroup.IsOnlyV2() { + return nil, fmt.Errorf("cgroups V2 is not yet supported. Enable cgroups V1 and retry") + } + if err := cg.Install(res); err != nil { + switch { + case errors.Is(err, unix.EACCES) && conf.Rootless: + log.Warningf("Skipping cgroup configuration in rootless mode: %v", err) + return nil, nil + default: + return nil, fmt.Errorf("configuring cgroup: %v", err) + } + } + return cg, nil +} |