diff options
author | Fabricio Voznika <fvoznika@google.com> | 2018-10-10 08:59:25 -0700 |
---|---|---|
committer | Shentubot <shentubot@google.com> | 2018-10-10 09:00:42 -0700 |
commit | 29cd05a7c66ee8061c0e5cf8e94c4e507dcf33e0 (patch) | |
tree | 91600ea6944d18c86f41b5f8003311a8c7bd154b | |
parent | 20508bafb88d2037ea3b2c8483b191ce72e7ad7e (diff) |
Add sandbox to cgroup
Sandbox creation uses the limits and reservations configured in the
OCI spec and set cgroup options accordinly. Then it puts both the
sandbox and gofer processes inside the cgroup.
It also allows the cgroup to be pre-configured by the caller. If the
cgroup already exists, sandbox and gofer processes will join the
cgroup but it will not modify the cgroup with spec limits.
PiperOrigin-RevId: 216538209
Change-Id: If2c65ffedf55820baab743a0edcfb091b89c1019
-rw-r--r-- | runsc/boot/loader.go | 90 | ||||
-rw-r--r-- | runsc/boot/loader_test.go | 11 | ||||
-rw-r--r-- | runsc/cgroup/BUILD | 24 | ||||
-rw-r--r-- | runsc/cgroup/cgroup.go | 405 | ||||
-rw-r--r-- | runsc/cgroup/cgroup_test.go | 56 | ||||
-rw-r--r-- | runsc/cmd/boot.go | 23 | ||||
-rw-r--r-- | runsc/container/container.go | 16 | ||||
-rw-r--r-- | runsc/sandbox/BUILD | 1 | ||||
-rw-r--r-- | runsc/sandbox/sandbox.go | 57 | ||||
-rw-r--r-- | runsc/specutils/BUILD | 1 | ||||
-rw-r--r-- | runsc/specutils/cpu.go | 90 | ||||
-rw-r--r-- | runsc/specutils/specutils.go | 37 | ||||
-rw-r--r-- | runsc/test/integration/BUILD | 4 | ||||
-rw-r--r-- | runsc/test/integration/integration_test.go | 84 | ||||
-rw-r--r-- | runsc/test/testutil/docker.go | 9 |
15 files changed, 776 insertions, 132 deletions
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 1ad6b09f4..dc3c6c3d0 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -20,6 +20,7 @@ import ( "math/rand" "os" "os/signal" + "runtime" "sync" "sync/atomic" "syscall" @@ -138,14 +139,39 @@ func init() { kernel.RegisterSyscallTable(slinux.AMD64) } +// Args are the arguments for New(). +type Args struct { + // Id is the sandbox ID. + ID string + // Spec is the sandbox specification. + Spec *specs.Spec + // Conf is the system configuration. + Conf *Config + // ControllerFD is the FD to the URPC controller. + ControllerFD int + // DeviceFD is an optional argument that is passed to the platform. + DeviceFD int + // GoferFDs is an array of FDs used to connect with the Gofer. + GoferFDs []int + // StdioFDs is the stdio for the application. + StdioFDs []int + // Console is set to true if using TTY. + Console bool + // NumCPU is the number of CPUs to create inside the sandbox. + NumCPU int + // TotalMem is the initial amount of total memory to report back to the + // container. + TotalMem uint64 +} + // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. -func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, goferFDs []int, stdioFDs []int, console bool) (*Loader, error) { +func New(args Args) (*Loader, error) { if err := usage.Init(); err != nil { return nil, fmt.Errorf("error setting up memory usage: %v", err) } // Create kernel and platform. - p, err := createPlatform(conf, deviceFD) + p, err := createPlatform(args.Conf, args.DeviceFD) if err != nil { return nil, fmt.Errorf("error creating platform: %v", err) } @@ -168,7 +194,7 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, } tk.SetClocks(time.NewCalibratedClocks()) - if err := enableStrace(conf); err != nil { + if err := enableStrace(args.Conf); err != nil { return nil, fmt.Errorf("failed to enable strace: %v", err) } @@ -176,35 +202,41 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, // this point. Netns is configured before Run() is called. Netstack is // configured using a control uRPC message. Host network is configured inside // Run(). - networkStack, err := newEmptyNetworkStack(conf, k) + networkStack, err := newEmptyNetworkStack(args.Conf, k) if err != nil { return nil, fmt.Errorf("failed to create network: %v", err) } // Create capabilities. - caps, err := specutils.Capabilities(spec.Process.Capabilities) + caps, err := specutils.Capabilities(args.Spec.Process.Capabilities) if err != nil { return nil, fmt.Errorf("error creating capabilities: %v", err) } // Convert the spec's additional GIDs to KGIDs. - extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) - for _, GID := range spec.Process.User.AdditionalGids { + extraKGIDs := make([]auth.KGID, 0, len(args.Spec.Process.User.AdditionalGids)) + for _, GID := range args.Spec.Process.User.AdditionalGids { extraKGIDs = append(extraKGIDs, auth.KGID(GID)) } // Create credentials. creds := auth.NewUserCredentials( - auth.KUID(spec.Process.User.UID), - auth.KGID(spec.Process.User.GID), + auth.KUID(args.Spec.Process.User.UID), + auth.KGID(args.Spec.Process.User.GID), extraKGIDs, caps, auth.NewRootUserNamespace()) - // Get CPU numbers from spec. - cpuNum, err := specutils.CalculateCPUNumber(spec) - if err != nil { - return nil, fmt.Errorf("cannot get cpus from spec: %v", err) + if args.NumCPU == 0 { + args.NumCPU = runtime.NumCPU() + } + log.Infof("CPUs: %d", args.NumCPU) + + if args.TotalMem > 0 { + // Adjust the total memory returned by the Sentry so that applications that + // use /proc/meminfo can make allocations based on this limit. + usage.MinimumTotalMemoryBytes = args.TotalMem + log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(2^30)) } // Initiate the Kernel object, which is required by the Context passed @@ -214,9 +246,9 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, Timekeeper: tk, RootUserNamespace: creds.UserNamespace, NetworkStack: networkStack, - ApplicationCores: uint(cpuNum), + ApplicationCores: uint(args.NumCPU), Vdso: vdso, - RootUTSNamespace: kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace), + RootUTSNamespace: kernel.NewUTSNamespace(args.Spec.Hostname, "", creds.UserNamespace), RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace), RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), }); err != nil { @@ -224,7 +256,7 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, } // Turn on packet logging if enabled. - if conf.LogPackets { + if args.Conf.LogPackets { log.Infof("Packet logging enabled") atomic.StoreUint32(&sniffer.LogPackets, 1) } else { @@ -233,7 +265,7 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, } // Create a watchdog. - watchdog := watchdog.New(k, watchdog.DefaultTimeout, conf.WatchdogAction) + watchdog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction) // Create the control server using the provided FD. // @@ -244,7 +276,7 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, // misconfigured process will cause an error, and we want the control // server up before that so that we don't time out trying to connect to // it. - ctrl, err := newController(controllerFD, k, watchdog) + ctrl, err := newController(args.ControllerFD, k, watchdog) if err != nil { return nil, fmt.Errorf("error creating control server: %v", err) } @@ -255,20 +287,20 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, return nil, fmt.Errorf("failed to ignore child stop signals: %v", err) } // Ensure that signals received are forwarded to the emulated kernel. - ps := syscall.Signal(conf.PanicSignal) + ps := syscall.Signal(args.Conf.PanicSignal) startSignalForwarding := sighandling.PrepareForwarding(k, ps) - if conf.PanicSignal != -1 { - // Panics if the sentry receives 'conf.PanicSignal'. + if args.Conf.PanicSignal != -1 { + // Panics if the sentry receives 'Config.PanicSignal'. panicChan := make(chan os.Signal, 1) signal.Notify(panicChan, ps) go func() { // S/R-SAFE: causes sentry panic. <-panicChan panic("Signal-induced panic") }() - log.Infof("Panic signal set to %v(%d)", ps, conf.PanicSignal) + log.Infof("Panic signal set to %v(%d)", ps, args.Conf.PanicSignal) } - procArgs, err := newProcess(id, spec, creds, k) + procArgs, err := newProcess(args.ID, args.Spec, creds, k) if err != nil { return nil, fmt.Errorf("failed to create root process: %v", err) } @@ -276,15 +308,15 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, l := &Loader{ k: k, ctrl: ctrl, - conf: conf, - console: console, + conf: args.Conf, + console: args.Console, watchdog: watchdog, - spec: spec, - goferFDs: goferFDs, - stdioFDs: stdioFDs, + spec: args.Spec, + goferFDs: args.GoferFDs, + stdioFDs: args.StdioFDs, startSignalForwarding: startSignalForwarding, rootProcArgs: procArgs, - sandboxID: id, + sandboxID: args.ID, processes: make(map[execID]*execProcess), } ctrl.manager.l = l diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index ea8411a8b..10efa4427 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -102,7 +102,16 @@ func createLoader() (*Loader, func(), error) { } stdio := []int{int(os.Stdin.Fd()), int(os.Stdout.Fd()), int(os.Stderr.Fd())} - l, err := New("foo", spec, conf, fd, -1 /* device fd */, []int{sandEnd}, stdio, false) + args := Args{ + ID: "foo", + Spec: spec, + Conf: conf, + ControllerFD: fd, + DeviceFD: -1, + GoferFDs: []int{sandEnd}, + StdioFDs: stdio, + } + l, err := New(args) if err != nil { cleanup() return nil, nil, err diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD new file mode 100644 index 000000000..4a535d230 --- /dev/null +++ b/runsc/cgroup/BUILD @@ -0,0 +1,24 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "cgroup", + srcs = ["cgroup.go"], + importpath = "gvisor.googlesource.com/gvisor/runsc/cgroup", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/log", + "//runsc/specutils", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + ], +) + +go_test( + name = "cgroup_test", + size = "small", + srcs = ["cgroup_test.go"], + embed = [":cgroup"], +) diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go new file mode 100644 index 000000000..6a0092be8 --- /dev/null +++ b/runsc/cgroup/cgroup.go @@ -0,0 +1,405 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package cgroup provides an interface to read and write configuration to +// cgroup. +package cgroup + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +const ( + cgroupRoot = "/sys/fs/cgroup" +) + +var controllers = map[string]controller{ + "blkio": &blockIO{}, + "cpu": &cpu{}, + "cpuset": &cpuSet{}, + "memory": &memory{}, + "net_cls": &networkClass{}, + "net_prio": &networkPrio{}, + + // These controllers either don't have anything in the OCI spec or is + // irrevalant for a sandbox, e.g. pids. + "devices": &noop{}, + "freezer": &noop{}, + "perf_event": &noop{}, + "pids": &noop{}, + "systemd": &noop{}, +} + +func setOptionalValueInt(path, name string, val *int64) error { + if val == nil || *val == 0 { + return nil + } + str := strconv.FormatInt(*val, 10) + return setValue(path, name, str) +} + +func setOptionalValueUint(path, name string, val *uint64) error { + if val == nil || *val == 0 { + return nil + } + str := strconv.FormatUint(*val, 10) + return setValue(path, name, str) +} + +func setOptionalValueUint32(path, name string, val *uint32) error { + if val == nil || *val == 0 { + return nil + } + str := strconv.FormatUint(uint64(*val), 10) + return setValue(path, name, str) +} + +func setOptionalValueUint16(path, name string, val *uint16) error { + if val == nil || *val == 0 { + return nil + } + str := strconv.FormatUint(uint64(*val), 10) + return setValue(path, name, str) +} + +func setValue(path, name, data string) error { + fullpath := filepath.Join(path, name) + return ioutil.WriteFile(fullpath, []byte(data), 0700) +} + +func getValue(path, name string) (string, error) { + fullpath := filepath.Join(path, name) + out, err := ioutil.ReadFile(fullpath) + if err != nil { + return "", err + } + return string(out), nil +} + +// fillFromAncestor sets the value of a cgroup file from the first ancestor +// that has content. It does nothing if the file in 'path' has already been set. +func fillFromAncestor(path string) (string, error) { + out, err := ioutil.ReadFile(path) + if err != nil { + return "", err + } + val := strings.TrimSpace(string(out)) + if val != "" { + // File is set, stop here. + return val, nil + } + + // File is not set, recurse to parent and then set here. + name := filepath.Base(path) + parent := filepath.Dir(filepath.Dir(path)) + val, err = fillFromAncestor(filepath.Join(parent, name)) + if err != nil { + return "", err + } + if err := ioutil.WriteFile(path, []byte(val), 0700); err != nil { + return "", err + } + return val, nil +} + +func countCpuset(cpuset string) (int, error) { + var count int + for _, p := range strings.Split(cpuset, ",") { + interval := strings.Split(p, "-") + switch len(interval) { + case 1: + if _, err := strconv.Atoi(interval[0]); err != nil { + return 0, err + } + count++ + + case 2: + start, err := strconv.Atoi(interval[0]) + if err != nil { + return 0, err + } + end, err := strconv.Atoi(interval[1]) + if err != nil { + return 0, err + } + if start < 0 || end < 0 || start > end { + return 0, fmt.Errorf("invalid cpuset: %q", p) + } + count += end - start + 1 + + default: + return 0, fmt.Errorf("invalid cpuset: %q", p) + } + } + return count, nil +} + +// Cgroup represents a group inside all controllers. For example: Name='/foo/bar' +// maps to /sys/fs/cgroup/<controller>/foo/bar on all controllers. +type Cgroup struct { + Name string `json:"name"` + Own bool `json:"own"` +} + +// New creates a new Cgroup instance if the spec includes a cgroup path. +// Otherwise it returns nil and false. +func New(spec *specs.Spec) (*Cgroup, bool) { + if spec.Linux == nil || spec.Linux.CgroupsPath == "" { + return nil, false + } + return &Cgroup{Name: spec.Linux.CgroupsPath}, true +} + +// Install creates and configures cgroups according to 'res'. If cgroup path +// already exists, it means that the caller has already provided a +// pre-configured cgroups, and 'res' is ignored. +func (c *Cgroup) Install(res *specs.LinuxResources) error { + if _, err := os.Stat(c.makePath("memory")); err == nil { + // If cgroup has already been created; it has been setup by caller. Don't + // make any changes to configuration, just join when sandbox/gofer starts. + log.Debugf("Using pre-created cgroup %q", c.Name) + return nil + } + + // Mark that cgroup resources are owned by me. + log.Debugf("Creating cgroup %q", c.Name) + c.Own = true + clean := specutils.MakeCleanup(func() { c.Uninstall() }) + defer clean.Clean() + + for key, ctrl := range controllers { + path := c.makePath(key) + if err := os.MkdirAll(path, 0755); err != nil { + return err + } + if res != nil { + if err := ctrl.set(res, path); err != nil { + return err + } + } + } + clean.Release() + return nil +} + +// Uninstall removes the settings done in Install(). If cgroup path already +// existed when Install() was called, Uninstall is a noop. +func (c *Cgroup) Uninstall() error { + if !c.Own { + // cgroup is managed by caller, don't touch it. + return nil + } + log.Debugf("Deleting cgroup %q", c.Name) + for key := range controllers { + if err := syscall.Rmdir(c.makePath(key)); err != nil && !os.IsNotExist(err) { + return err + } + } + return nil +} + +// Add adds given process to all controllers. +func (c *Cgroup) Add(pid int) error { + for key := range controllers { + if err := setValue(c.makePath(key), "cgroup.procs", strconv.Itoa(pid)); err != nil { + return err + } + } + return nil +} + +// NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'. +func (c *Cgroup) NumCPU() (int, error) { + path := c.makePath("cpuset") + cpuset, err := getValue(path, "cpuset.cpus") + if err != nil { + return 0, err + } + return countCpuset(strings.TrimSpace(cpuset)) +} + +// MemoryLimit returns the memory limit. +func (c *Cgroup) MemoryLimit() (uint64, error) { + path := c.makePath("memory") + limStr, err := getValue(path, "memory.limit_in_bytes") + if err != nil { + return 0, err + } + return strconv.ParseUint(strings.TrimSpace(limStr), 10, 64) +} + +func (c *Cgroup) makePath(controllerName string) string { + return filepath.Join(cgroupRoot, controllerName, c.Name) +} + +type controller interface { + set(*specs.LinuxResources, string) error +} + +type noop struct{} + +func (*noop) set(*specs.LinuxResources, string) error { + return nil +} + +type memory struct{} + +func (*memory) set(spec *specs.LinuxResources, path string) error { + if spec.Memory == nil { + return nil + } + if err := setOptionalValueInt(path, "memory.limit_in_bytes", spec.Memory.Limit); err != nil { + return err + } + if err := setOptionalValueInt(path, "memory.soft_limit_in_bytes", spec.Memory.Reservation); err != nil { + return err + } + if err := setOptionalValueInt(path, "memory.memsw.limit_in_bytes", spec.Memory.Swap); err != nil { + return err + } + if err := setOptionalValueInt(path, "memory.kmem.limit_in_bytes", spec.Memory.Kernel); err != nil { + return err + } + if err := setOptionalValueInt(path, "memory.kmem.tcp.limit_in_bytes", spec.Memory.KernelTCP); err != nil { + return err + } + if err := setOptionalValueUint(path, "memory.swappiness", spec.Memory.Swappiness); err != nil { + return err + } + + if spec.Memory.DisableOOMKiller != nil && *spec.Memory.DisableOOMKiller { + if err := setValue(path, "memory.oom_control", "1"); err != nil { + return err + } + } + return nil +} + +type cpu struct{} + +func (*cpu) set(spec *specs.LinuxResources, path string) error { + if spec.CPU == nil { + return nil + } + if err := setOptionalValueUint(path, "cpu.shares", spec.CPU.Shares); err != nil { + return err + } + if err := setOptionalValueInt(path, "cpu.cfs_quota_us", spec.CPU.Quota); err != nil { + return err + } + return setOptionalValueUint(path, "cpu.cfs_period_us", spec.CPU.Period) +} + +type cpuSet struct{} + +func (*cpuSet) set(spec *specs.LinuxResources, path string) error { + // cpuset.cpus and mems are required fields, but are not set on a new cgroup. + // If not set in the spec, get it from one of the ancestors cgroup. + if spec.CPU == nil || spec.CPU.Cpus == "" { + if _, err := fillFromAncestor(filepath.Join(path, "cpuset.cpus")); err != nil { + return err + } + } else { + if err := setValue(path, "cpuset.cpus", spec.CPU.Cpus); err != nil { + return err + } + } + + if spec.CPU == nil || spec.CPU.Mems == "" { + _, err := fillFromAncestor(filepath.Join(path, "cpuset.mems")) + return err + } + mems := spec.CPU.Mems + return setValue(path, "cpuset.mems", mems) +} + +type blockIO struct{} + +func (*blockIO) set(spec *specs.LinuxResources, path string) error { + if spec.BlockIO == nil { + return nil + } + + if err := setOptionalValueUint16(path, "blkio.weight", spec.BlockIO.Weight); err != nil { + return err + } + if err := setOptionalValueUint16(path, "blkio.leaf_weight", spec.BlockIO.LeafWeight); err != nil { + return err + } + + for _, dev := range spec.BlockIO.WeightDevice { + val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.Weight) + if err := setValue(path, "blkio.weight_device", val); err != nil { + return err + } + val = fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.LeafWeight) + if err := setValue(path, "blkio.leaf_weight_device", val); err != nil { + return err + } + } + if err := setThrottle(path, "blkio.throttle.read_bps_device", spec.BlockIO.ThrottleReadBpsDevice); err != nil { + return err + } + if err := setThrottle(path, "blkio.throttle.write_bps_device", spec.BlockIO.ThrottleWriteBpsDevice); err != nil { + return err + } + if err := setThrottle(path, "blkio.throttle.read_iops_device", spec.BlockIO.ThrottleReadIOPSDevice); err != nil { + return err + } + return setThrottle(path, "blkio.throttle.write_iops_device", spec.BlockIO.ThrottleWriteIOPSDevice) +} + +func setThrottle(path, name string, devs []specs.LinuxThrottleDevice) error { + for _, dev := range devs { + val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.Rate) + if err := setValue(path, name, val); err != nil { + return err + } + } + return nil +} + +type networkClass struct{} + +func (*networkClass) set(spec *specs.LinuxResources, path string) error { + if spec.Network == nil { + return nil + } + return setOptionalValueUint32(path, "net_cls.classid", spec.Network.ClassID) +} + +type networkPrio struct{} + +func (*networkPrio) set(spec *specs.LinuxResources, path string) error { + if spec.Network == nil { + return nil + } + for _, prio := range spec.Network.Priorities { + val := fmt.Sprintf("%s %d", prio.Name, prio.Priority) + if err := setValue(path, "net_prio.ifpriomap", val); err != nil { + return err + } + } + return nil +} diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go new file mode 100644 index 000000000..cde915329 --- /dev/null +++ b/runsc/cgroup/cgroup_test.go @@ -0,0 +1,56 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cgroup + +import ( + "testing" +) + +func TestCountCpuset(t *testing.T) { + for _, tc := range []struct { + str string + want int + error bool + }{ + {str: "0", want: 1}, + {str: "0,1,2,8,9,10", want: 6}, + {str: "0-1", want: 2}, + {str: "0-7", want: 8}, + {str: "0-7,16,32-39,64,65", want: 19}, + {str: "a", error: true}, + {str: "5-a", error: true}, + {str: "a-5", error: true}, + {str: "-10", error: true}, + {str: "15-", error: true}, + {str: "-", error: true}, + {str: "--", error: true}, + } { + t.Run(tc.str, func(t *testing.T) { + got, err := countCpuset(tc.str) + if tc.error { + if err == nil { + t.Errorf("countCpuset(%q) should have failed", tc.str) + } + } else { + if err != nil { + t.Errorf("countCpuset(%q) failed: %v", tc.str, err) + } + if tc.want != got { + t.Errorf("countCpuset(%q) want: %d, got: %d", tc.str, tc.want, got) + } + } + }) + } +} diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index c6f78f63f..d26e92bcd 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -59,6 +59,13 @@ type Boot struct { // applyCaps determines if capabilities defined in the spec should be applied // to the process. applyCaps bool + + // cpuNum number of CPUs to create inside the sandbox. + cpuNum int + + // totalMem sets the initial amount of total memory to report back to the + // container. + totalMem uint64 } // Name implements subcommands.Command.Name. @@ -86,6 +93,8 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order") f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls") f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") + f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox") + f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container") } // Execute implements subcommands.Command.Execute. It starts a sandbox in a @@ -143,7 +152,19 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } // Create the loader. - l, err := boot.New(f.Arg(0), spec, conf, b.controllerFD, b.deviceFD, b.ioFDs.GetArray(), b.stdioFDs.GetArray(), b.console) + bootArgs := boot.Args{ + ID: f.Arg(0), + Spec: spec, + Conf: conf, + ControllerFD: b.controllerFD, + DeviceFD: b.deviceFD, + GoferFDs: b.ioFDs.GetArray(), + StdioFDs: b.stdioFDs.GetArray(), + Console: b.console, + NumCPU: b.cpuNum, + TotalMem: b.totalMem, + } + l, err := boot.New(bootArgs) if err != nil { Fatalf("error creating loader: %v", err) } diff --git a/runsc/container/container.go b/runsc/container/container.go index f0cdee8d3..eaa62daf1 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -262,6 +262,8 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo Status: Creating, Owner: os.Getenv("USER"), } + cu := specutils.MakeCleanup(func() { c.Destroy() }) + defer cu.Clean() // If the metadata annotations indicate that this container should be // started in an existing sandbox, we must do so. The metadata will @@ -276,12 +278,13 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // Start a new sandbox for this container. Any errors after this point // must destroy the container. - s, err := sandbox.Create(id, spec, conf, bundleDir, consoleSocket, ioFiles) + c.Sandbox, err = sandbox.Create(id, spec, conf, bundleDir, consoleSocket, ioFiles) if err != nil { - c.Destroy() return nil, err } - c.Sandbox = s + if err := c.Sandbox.AddGoferToCgroup(c.GoferPid); err != nil { + return nil, err + } } else { // This is sort of confusing. For a sandbox with a root // container and a child container in it, runsc sees: @@ -300,7 +303,6 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // Find the sandbox associated with this ID. sb, err := Load(conf.RootDir, sbid) if err != nil { - c.Destroy() return nil, err } c.Sandbox = sb.Sandbox @@ -309,7 +311,6 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // Save the metadata file. if err := c.save(); err != nil { - c.Destroy() return nil, err } @@ -317,11 +318,11 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // this file is created, so it must be the last thing we do. if pidFile != "" { if err := ioutil.WriteFile(pidFile, []byte(strconv.Itoa(c.SandboxPid())), 0644); err != nil { - c.Destroy() return nil, fmt.Errorf("error writing PID file: %v", err) } } + cu.Release() return c, nil } @@ -358,6 +359,9 @@ func (c *Container) Start(conf *boot.Config) error { if err := c.Sandbox.Start(c.Spec, conf, c.ID, ioFiles); err != nil { return err } + if err := c.Sandbox.AddGoferToCgroup(c.GoferPid); err != nil { + return err + } } // "If any poststart hook fails, the runtime MUST log a warning, but diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 09965dcc0..eb9c4cd76 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -21,6 +21,7 @@ go_library( "//pkg/sentry/platform/kvm", "//pkg/urpc", "//runsc/boot", + "//runsc/cgroup", "//runsc/console", "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 847417a15..26d725bdd 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -34,6 +34,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm" "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/cgroup" "gvisor.googlesource.com/gvisor/runsc/console" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -58,12 +59,26 @@ type Sandbox struct { // Chroot is the path to the chroot directory that the sandbox process // is running in. Chroot string `json:"chroot"` + + // Ccroup has the cgroup configuration for the sandbox. + Cgroup *cgroup.Cgroup `json:"cgroup"` } // Create creates the sandbox process. The caller must call Destroy() on the // sandbox. func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string, ioFiles []*os.File) (*Sandbox, error) { s := &Sandbox{ID: id} + c := specutils.MakeCleanup(func() { s.destroy() }) + defer c.Clean() + + if cg, ok := cgroup.New(spec); ok { + s.Cgroup = cg + + // If there is cgroup config, install it before creating sandbox process. + if err := s.Cgroup.Install(spec.Linux.Resources); err != nil { + return nil, fmt.Errorf("error configuring cgroup: %v", err) + } + } // Create the sandbox process. if err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, ioFiles); err != nil { @@ -75,6 +90,13 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo return nil, err } + if s.Cgroup != nil { + if err := s.Cgroup.Add(s.Pid); err != nil { + return nil, fmt.Errorf("error adding sandbox to cgroup: %v", err) + } + } + + c.Release() return s, nil } @@ -483,6 +505,24 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } } + if s.Cgroup != nil { + cpuNum, err := s.Cgroup.NumCPU() + if err != nil { + return fmt.Errorf("error getting cpu count from cgroups: %v", err) + } + cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum)) + + mem, err := s.Cgroup.MemoryLimit() + if err != nil { + return fmt.Errorf("error getting memory limit from cgroups: %v", err) + } + // When memory limit is unset, a "large" number is returned. In that case, + // just stick with the default. + if mem < 0x7ffffffffffff000 { + cmd.Args = append(cmd.Args, "--total-memory", strconv.FormatUint(mem, 10)) + } + } + // Add container as the last argument. cmd.Args = append(cmd.Args, s.ID) @@ -590,8 +630,15 @@ func (s *Sandbox) destroy() error { } } + if s.Cgroup != nil { + if err := s.Cgroup.Uninstall(); err != nil { + return err + } + } if s.Chroot != "" { - return tearDownChroot(s.Chroot) + if err := tearDownChroot(s.Chroot); err != nil { + return err + } } return nil @@ -761,6 +808,14 @@ func (s *Sandbox) waitForStopped() error { return backoff.Retry(op, b) } +// AddGoferToCgroup adds the gofer process to the sandbox's cgroup. +func (s *Sandbox) AddGoferToCgroup(pid int) error { + if s.Cgroup != nil { + return s.Cgroup.Add(pid) + } + return nil +} + // deviceFileForPlatform opens the device file for the given platform. If the // platform does not need a device file, then nil is returned. func deviceFileForPlatform(p boot.PlatformType) (*os.File, error) { diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index f1a99ce48..e73b2293f 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -5,7 +5,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "specutils", srcs = [ - "cpu.go", "namespace.go", "specutils.go", ], diff --git a/runsc/specutils/cpu.go b/runsc/specutils/cpu.go deleted file mode 100644 index 9abe26b64..000000000 --- a/runsc/specutils/cpu.go +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package specutils - -import ( - "fmt" - "runtime" - "strconv" - "strings" - - specs "github.com/opencontainers/runtime-spec/specs-go" -) - -// CalculateCPUNumber calculates the number of CPUs that should be exposed -// inside the sandbox. -func CalculateCPUNumber(spec *specs.Spec) (int, error) { - // If spec does not contain CPU field, then return the number of host CPUs. - if spec == nil || spec.Linux == nil || spec.Linux.Resources == nil || spec.Linux.Resources.CPU == nil { - return runtime.NumCPU(), nil - } - cpuSpec := spec.Linux.Resources.CPU - - // If cpuSpec.Cpus is specified, then parse and return that. They must be in - // the list format for cpusets, which is "a comma-separated list of CPU - // numbers and ranges of numbers, in ASCII decimal." --man 7 cpuset. - cpus := cpuSpec.Cpus - if cpus != "" { - cpuNum := 0 - for _, subs := range strings.Split(cpus, ",") { - result, err := parseCPUNumber(subs) - if err != nil { - return 0, err - } - cpuNum += result - } - return cpuNum, nil - } - - // If CPU.Quota and CPU.Period are specified, we can divide them to get an - // approximation of the number of CPUs needed. - if cpuSpec.Quota != nil && cpuSpec.Period != nil && *cpuSpec.Period != 0 { - cpuQuota := *cpuSpec.Quota - cpuPeriod := *cpuSpec.Period - return int(cpuQuota)/int(cpuPeriod) + 1, nil - } - - // Default to number of host cpus. - return runtime.NumCPU(), nil -} - -// parseCPUNumber converts a cpuset string into the number of cpus included in -// the string , e.g. "3-6" -> 4. -func parseCPUNumber(cpus string) (int, error) { - switch cpusSlice := strings.Split(cpus, "-"); len(cpusSlice) { - case 1: - // cpus is not a range. We must only check that it is a valid number. - if _, err := strconv.Atoi(cpus); err != nil { - return 0, fmt.Errorf("invalid individual cpu number %q", cpus) - } - return 1, nil - case 2: - // cpus is a range. We must check that start and end are valid numbers, - // and calculate their difference (inclusively). - first, err := strconv.Atoi(cpusSlice[0]) - if err != nil || first < 0 { - return 0, fmt.Errorf("invalid first cpu number %q in range %q", cpusSlice[0], cpus) - } - last, err := strconv.Atoi(cpusSlice[1]) - if err != nil || last < 0 { - return 0, fmt.Errorf("invalid last cpu number %q in range %q", cpusSlice[1], cpus) - } - cpuNum := last - first + 1 - if cpuNum <= 0 { - return 0, fmt.Errorf("cpu range %q does not include positive number of cpus", cpus) - } - } - return 0, fmt.Errorf("invalid cpu string %q", cpus) -} diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index daf10b875..ac017ba2d 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -43,6 +43,13 @@ func LogSpec(spec *specs.Spec) { log.Debugf("Spec: %+v", spec) log.Debugf("Spec.Hooks: %+v", spec.Hooks) log.Debugf("Spec.Linux: %+v", spec.Linux) + if spec.Linux != nil && spec.Linux.Resources != nil { + res := spec.Linux.Resources + log.Debugf("Spec.Linux.Resources.Memory: %+v", res.Memory) + log.Debugf("Spec.Linux.Resources.CPU: %+v", res.CPU) + log.Debugf("Spec.Linux.Resources.BlockIO: %+v", res.BlockIO) + log.Debugf("Spec.Linux.Resources.Network: %+v", res.Network) + } log.Debugf("Spec.Process: %+v", spec.Process) log.Debugf("Spec.Root: %+v", spec.Root) } @@ -402,3 +409,33 @@ func ContainsStr(strs []string, str string) bool { } return false } + +// Cleanup allows defers to be aborted when cleanup needs to happen +// conditionally. Usage: +// c := MakeCleanup(func() { f.Close() }) +// defer c.Clean() // any failure before release is called will close the file. +// ... +// c.Release() // on success, aborts closing the file and return it. +// return f +type Cleanup struct { + clean func() + released bool +} + +// MakeCleanup creates a new Cleanup object. +func MakeCleanup(f func()) Cleanup { + return Cleanup{clean: f} +} + +// Clean calls the cleanup function. +func (c *Cleanup) Clean() { + if !c.released { + c.clean() + } +} + +// Release releases the cleanup from its duties, i.e. cleanup function is not +// called after this point. +func (c *Cleanup) Release() { + c.released = true +} diff --git a/runsc/test/integration/BUILD b/runsc/test/integration/BUILD index 4407016ad..726ebf49e 100644 --- a/runsc/test/integration/BUILD +++ b/runsc/test/integration/BUILD @@ -15,9 +15,7 @@ go_test( "manual", "local", ], - deps = [ - "//runsc/test/testutil", - ], + deps = ["//runsc/test/testutil"], ) go_library( diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index 5f24aeed5..5480c5bbe 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -26,6 +26,7 @@ import ( "net" "net/http" "os" + "strconv" "strings" "testing" "time" @@ -179,6 +180,89 @@ func TestConnectToSelf(t *testing.T) { } } +func TestMemLimit(t *testing.T) { + if err := testutil.Pull("alpine"); err != nil { + t.Fatal("docker pull failed:", err) + } + d := testutil.MakeDocker("cgroup-test") + cmd := "cat /proc/meminfo | grep MemTotal: | awk '{print $2}'" + out, err := d.RunFg("--memory=500MB", "alpine", "sh", "-c", cmd) + if err != nil { + t.Fatal("docker run failed:", err) + } + defer d.CleanUp() + + // Remove warning message that swap isn't present. + if strings.HasPrefix(out, "WARNING") { + lines := strings.Split(out, "\n") + if len(lines) != 3 { + t.Fatalf("invalid output: %s", out) + } + out = lines[1] + } + + got, err := strconv.ParseUint(strings.TrimSpace(out), 10, 64) + if err != nil { + t.Fatalf("failed to parse %q: %v", out, err) + } + if want := uint64(500 * 1024); got != want { + t.Errorf("MemTotal got: %d, want: %d", got, want) + } +} + +func TestNumCPU(t *testing.T) { + if err := testutil.Pull("alpine"); err != nil { + t.Fatal("docker pull failed:", err) + } + d := testutil.MakeDocker("cgroup-test") + cmd := "cat /proc/cpuinfo | grep 'processor.*:' | wc -l" + out, err := d.RunFg("--cpuset-cpus=0", "alpine", "sh", "-c", cmd) + if err != nil { + t.Fatal("docker run failed:", err) + } + defer d.CleanUp() + + got, err := strconv.Atoi(strings.TrimSpace(out)) + if err != nil { + t.Fatalf("failed to parse %q: %v", out, err) + } + if want := 1; got != want { + t.Errorf("MemTotal got: %d, want: %d", got, want) + } +} + +// TestCgroup sets cgroup options and checks that container can start. +// TODO: Verify that these were set to cgroup on the host. +func TestCgroup(t *testing.T) { + if err := testutil.Pull("alpine"); err != nil { + t.Fatal("docker pull failed:", err) + } + d := testutil.MakeDocker("cgroup-test") + + var args []string + args = append(args, "--cpu-shares=1000") + args = append(args, "--cpu-period=2000") + args = append(args, "--cpu-quota=3000") + args = append(args, "--cpuset-cpus=0") + args = append(args, "--cpuset-mems=0") + args = append(args, "--kernel-memory=100MB") + args = append(args, "--memory=1GB") + args = append(args, "--memory-reservation=500MB") + args = append(args, "--memory-swap=2GB") + args = append(args, "--memory-swappiness=5") + args = append(args, "--blkio-weight=750") + + args = append(args, "hello-world") + if err := d.Run(args...); err != nil { + t.Fatal("docker create failed:", err) + } + defer d.CleanUp() + + if _, err := d.WaitForOutput("Hello from Docker!", 5*time.Second); err != nil { + t.Fatalf("docker didn't say hello: %v", err) + } +} + func TestMain(m *testing.M) { testutil.EnsureSupportedDockerVersion() os.Exit(m.Run()) diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index d70b4377a..2f15ab818 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -198,6 +198,15 @@ func (d *Docker) Run(args ...string) error { return err } +// RunFg calls 'docker run' with the arguments provided in the foreground. It +// blocks until the container exits and returns the output. +func (d *Docker) RunFg(args ...string) (string, error) { + a := []string{"run", "--runtime", d.Runtime, "--name", d.Name} + a = append(a, args...) + out, err := do(a...) + return string(out), err +} + // Logs calls 'docker logs'. func (d *Docker) Logs() (string, error) { return do("logs", d.Name) |