diff options
Diffstat (limited to 'runsc')
-rw-r--r-- | runsc/boot/loader.go | 90 | ||||
-rw-r--r-- | runsc/boot/loader_test.go | 11 | ||||
-rw-r--r-- | runsc/cgroup/BUILD | 24 | ||||
-rw-r--r-- | runsc/cgroup/cgroup.go | 405 | ||||
-rw-r--r-- | runsc/cgroup/cgroup_test.go | 56 | ||||
-rw-r--r-- | runsc/cmd/boot.go | 23 | ||||
-rw-r--r-- | runsc/container/container.go | 16 | ||||
-rw-r--r-- | runsc/sandbox/BUILD | 1 | ||||
-rw-r--r-- | runsc/sandbox/sandbox.go | 57 | ||||
-rw-r--r-- | runsc/specutils/BUILD | 1 | ||||
-rw-r--r-- | runsc/specutils/cpu.go | 90 | ||||
-rw-r--r-- | runsc/specutils/specutils.go | 37 | ||||
-rw-r--r-- | runsc/test/integration/BUILD | 4 | ||||
-rw-r--r-- | runsc/test/integration/integration_test.go | 84 | ||||
-rw-r--r-- | runsc/test/testutil/docker.go | 9 |
15 files changed, 776 insertions, 132 deletions
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 1ad6b09f4..dc3c6c3d0 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -20,6 +20,7 @@ import ( "math/rand" "os" "os/signal" + "runtime" "sync" "sync/atomic" "syscall" @@ -138,14 +139,39 @@ func init() { kernel.RegisterSyscallTable(slinux.AMD64) } +// Args are the arguments for New(). +type Args struct { + // ID is the sandbox ID. + ID string + // Spec is the sandbox specification. + Spec *specs.Spec + // Conf is the system configuration. + Conf *Config + // ControllerFD is the FD to the URPC controller. + ControllerFD int + // DeviceFD is an optional argument that is passed to the platform. + DeviceFD int + // GoferFDs is an array of FDs used to connect with the Gofer. + GoferFDs []int + // StdioFDs is the stdio for the application. + StdioFDs []int + // Console is set to true if using TTY. + Console bool + // NumCPU is the number of CPUs to create inside the sandbox. + NumCPU int + // TotalMem is the initial amount of total memory to report back to the + // container. + TotalMem uint64 +} + // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. -func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, goferFDs []int, stdioFDs []int, console bool) (*Loader, error) { +func New(args Args) (*Loader, error) { if err := usage.Init(); err != nil { return nil, fmt.Errorf("error setting up memory usage: %v", err) } // Create kernel and platform.
- p, err := createPlatform(conf, deviceFD) + p, err := createPlatform(args.Conf, args.DeviceFD) if err != nil { return nil, fmt.Errorf("error creating platform: %v", err) } @@ -168,7 +194,7 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, } tk.SetClocks(time.NewCalibratedClocks()) - if err := enableStrace(conf); err != nil { + if err := enableStrace(args.Conf); err != nil { return nil, fmt.Errorf("failed to enable strace: %v", err) } @@ -176,35 +202,41 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, // this point. Netns is configured before Run() is called. Netstack is // configured using a control uRPC message. Host network is configured inside // Run(). - networkStack, err := newEmptyNetworkStack(conf, k) + networkStack, err := newEmptyNetworkStack(args.Conf, k) if err != nil { return nil, fmt.Errorf("failed to create network: %v", err) } // Create capabilities. - caps, err := specutils.Capabilities(spec.Process.Capabilities) + caps, err := specutils.Capabilities(args.Spec.Process.Capabilities) if err != nil { return nil, fmt.Errorf("error creating capabilities: %v", err) } // Convert the spec's additional GIDs to KGIDs. - extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) - for _, GID := range spec.Process.User.AdditionalGids { + extraKGIDs := make([]auth.KGID, 0, len(args.Spec.Process.User.AdditionalGids)) + for _, GID := range args.Spec.Process.User.AdditionalGids { extraKGIDs = append(extraKGIDs, auth.KGID(GID)) } // Create credentials. creds := auth.NewUserCredentials( - auth.KUID(spec.Process.User.UID), - auth.KGID(spec.Process.User.GID), + auth.KUID(args.Spec.Process.User.UID), + auth.KGID(args.Spec.Process.User.GID), extraKGIDs, caps, auth.NewRootUserNamespace()) - // Get CPU numbers from spec. 
- cpuNum, err := specutils.CalculateCPUNumber(spec) - if err != nil { - return nil, fmt.Errorf("cannot get cpus from spec: %v", err) + if args.NumCPU == 0 { + args.NumCPU = runtime.NumCPU() + } + log.Infof("CPUs: %d", args.NumCPU) + + if args.TotalMem > 0 { + // Adjust the total memory returned by the Sentry so that applications that + // use /proc/meminfo can make allocations based on this limit. + usage.MinimumTotalMemoryBytes = args.TotalMem + log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30)) } // Initiate the Kernel object, which is required by the Context passed @@ -214,9 +246,9 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, Timekeeper: tk, RootUserNamespace: creds.UserNamespace, NetworkStack: networkStack, - ApplicationCores: uint(cpuNum), + ApplicationCores: uint(args.NumCPU), Vdso: vdso, - RootUTSNamespace: kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace), + RootUTSNamespace: kernel.NewUTSNamespace(args.Spec.Hostname, "", creds.UserNamespace), RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace), RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), }); err != nil { @@ -224,7 +256,7 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, } // Turn on packet logging if enabled. - if conf.LogPackets { + if args.Conf.LogPackets { log.Infof("Packet logging enabled") atomic.StoreUint32(&sniffer.LogPackets, 1) } else { @@ -233,7 +265,7 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, } // Create a watchdog. - watchdog := watchdog.New(k, watchdog.DefaultTimeout, conf.WatchdogAction) + watchdog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction) // Create the control server using the provided FD.
// @@ -244,7 +276,7 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, // misconfigured process will cause an error, and we want the control // server up before that so that we don't time out trying to connect to // it. - ctrl, err := newController(controllerFD, k, watchdog) + ctrl, err := newController(args.ControllerFD, k, watchdog) if err != nil { return nil, fmt.Errorf("error creating control server: %v", err) } @@ -255,20 +287,20 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, return nil, fmt.Errorf("failed to ignore child stop signals: %v", err) } // Ensure that signals received are forwarded to the emulated kernel. - ps := syscall.Signal(conf.PanicSignal) + ps := syscall.Signal(args.Conf.PanicSignal) startSignalForwarding := sighandling.PrepareForwarding(k, ps) - if conf.PanicSignal != -1 { - // Panics if the sentry receives 'conf.PanicSignal'. + if args.Conf.PanicSignal != -1 { + // Panics if the sentry receives 'Config.PanicSignal'. panicChan := make(chan os.Signal, 1) signal.Notify(panicChan, ps) go func() { // S/R-SAFE: causes sentry panic. 
<-panicChan panic("Signal-induced panic") }() - log.Infof("Panic signal set to %v(%d)", ps, conf.PanicSignal) + log.Infof("Panic signal set to %v(%d)", ps, args.Conf.PanicSignal) } - procArgs, err := newProcess(id, spec, creds, k) + procArgs, err := newProcess(args.ID, args.Spec, creds, k) if err != nil { return nil, fmt.Errorf("failed to create root process: %v", err) } @@ -276,15 +308,15 @@ func New(id string, spec *specs.Spec, conf *Config, controllerFD, deviceFD int, l := &Loader{ k: k, ctrl: ctrl, - conf: conf, - console: console, + conf: args.Conf, + console: args.Console, watchdog: watchdog, - spec: spec, - goferFDs: goferFDs, - stdioFDs: stdioFDs, + spec: args.Spec, + goferFDs: args.GoferFDs, + stdioFDs: args.StdioFDs, startSignalForwarding: startSignalForwarding, rootProcArgs: procArgs, - sandboxID: id, + sandboxID: args.ID, processes: make(map[execID]*execProcess), } ctrl.manager.l = l diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index ea8411a8b..10efa4427 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -102,7 +102,16 @@ func createLoader() (*Loader, func(), error) { } stdio := []int{int(os.Stdin.Fd()), int(os.Stdout.Fd()), int(os.Stderr.Fd())} - l, err := New("foo", spec, conf, fd, -1 /* device fd */, []int{sandEnd}, stdio, false) + args := Args{ + ID: "foo", + Spec: spec, + Conf: conf, + ControllerFD: fd, + DeviceFD: -1, + GoferFDs: []int{sandEnd}, + StdioFDs: stdio, + } + l, err := New(args) if err != nil { cleanup() return nil, nil, err diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD new file mode 100644 index 000000000..4a535d230 --- /dev/null +++ b/runsc/cgroup/BUILD @@ -0,0 +1,24 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "cgroup", + srcs = ["cgroup.go"], + importpath = "gvisor.googlesource.com/gvisor/runsc/cgroup", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/log", + 
"//runsc/specutils", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + ], +) + +go_test( + name = "cgroup_test", + size = "small", + srcs = ["cgroup_test.go"], + embed = [":cgroup"], +) diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go new file mode 100644 index 000000000..6a0092be8 --- /dev/null +++ b/runsc/cgroup/cgroup.go @@ -0,0 +1,405 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package cgroup provides an interface to read and write configuration to +// cgroup. +package cgroup + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +const ( + cgroupRoot = "/sys/fs/cgroup" +) + +var controllers = map[string]controller{ + "blkio": &blockIO{}, + "cpu": &cpu{}, + "cpuset": &cpuSet{}, + "memory": &memory{}, + "net_cls": &networkClass{}, + "net_prio": &networkPrio{}, + + // These controllers either don't have anything in the OCI spec or is + // irrevalant for a sandbox, e.g. pids. 
+ "devices": &noop{}, + "freezer": &noop{}, + "perf_event": &noop{}, + "pids": &noop{}, + "systemd": &noop{}, +} + +func setOptionalValueInt(path, name string, val *int64) error { + if val == nil || *val == 0 { + return nil + } + str := strconv.FormatInt(*val, 10) + return setValue(path, name, str) +} + +func setOptionalValueUint(path, name string, val *uint64) error { + if val == nil || *val == 0 { + return nil + } + str := strconv.FormatUint(*val, 10) + return setValue(path, name, str) +} + +func setOptionalValueUint32(path, name string, val *uint32) error { + if val == nil || *val == 0 { + return nil + } + str := strconv.FormatUint(uint64(*val), 10) + return setValue(path, name, str) +} + +func setOptionalValueUint16(path, name string, val *uint16) error { + if val == nil || *val == 0 { + return nil + } + str := strconv.FormatUint(uint64(*val), 10) + return setValue(path, name, str) +} + +func setValue(path, name, data string) error { + fullpath := filepath.Join(path, name) + return ioutil.WriteFile(fullpath, []byte(data), 0700) +} + +func getValue(path, name string) (string, error) { + fullpath := filepath.Join(path, name) + out, err := ioutil.ReadFile(fullpath) + if err != nil { + return "", err + } + return string(out), nil +} + +// fillFromAncestor sets the value of a cgroup file from the first ancestor +// that has content. It does nothing if the file in 'path' has already been set. +func fillFromAncestor(path string) (string, error) { + out, err := ioutil.ReadFile(path) + if err != nil { + return "", err + } + val := strings.TrimSpace(string(out)) + if val != "" { + // File is set, stop here. + return val, nil + } + + // File is not set, recurse to parent and then set here. 
+ name := filepath.Base(path) + parent := filepath.Dir(filepath.Dir(path)) + val, err = fillFromAncestor(filepath.Join(parent, name)) + if err != nil { + return "", err + } + if err := ioutil.WriteFile(path, []byte(val), 0700); err != nil { + return "", err + } + return val, nil +} + +func countCpuset(cpuset string) (int, error) { + var count int + for _, p := range strings.Split(cpuset, ",") { + interval := strings.Split(p, "-") + switch len(interval) { + case 1: + if _, err := strconv.Atoi(interval[0]); err != nil { + return 0, err + } + count++ + + case 2: + start, err := strconv.Atoi(interval[0]) + if err != nil { + return 0, err + } + end, err := strconv.Atoi(interval[1]) + if err != nil { + return 0, err + } + if start < 0 || end < 0 || start > end { + return 0, fmt.Errorf("invalid cpuset: %q", p) + } + count += end - start + 1 + + default: + return 0, fmt.Errorf("invalid cpuset: %q", p) + } + } + return count, nil +} + +// Cgroup represents a group inside all controllers. For example: Name='/foo/bar' +// maps to /sys/fs/cgroup/<controller>/foo/bar on all controllers. +type Cgroup struct { + Name string `json:"name"` + Own bool `json:"own"` +} + +// New creates a new Cgroup instance if the spec includes a cgroup path. +// Otherwise it returns nil and false. +func New(spec *specs.Spec) (*Cgroup, bool) { + if spec.Linux == nil || spec.Linux.CgroupsPath == "" { + return nil, false + } + return &Cgroup{Name: spec.Linux.CgroupsPath}, true +} + +// Install creates and configures cgroups according to 'res'. If cgroup path +// already exists, it means that the caller has already provided a +// pre-configured cgroups, and 'res' is ignored. +func (c *Cgroup) Install(res *specs.LinuxResources) error { + if _, err := os.Stat(c.makePath("memory")); err == nil { + // If cgroup has already been created; it has been setup by caller. Don't + // make any changes to configuration, just join when sandbox/gofer starts. 
+ log.Debugf("Using pre-created cgroup %q", c.Name) + return nil + } + + // Mark that cgroup resources are owned by me. + log.Debugf("Creating cgroup %q", c.Name) + c.Own = true + clean := specutils.MakeCleanup(func() { c.Uninstall() }) + defer clean.Clean() + + for key, ctrl := range controllers { + path := c.makePath(key) + if err := os.MkdirAll(path, 0755); err != nil { + return err + } + if res != nil { + if err := ctrl.set(res, path); err != nil { + return err + } + } + } + clean.Release() + return nil +} + +// Uninstall removes the settings done in Install(). If cgroup path already +// existed when Install() was called, Uninstall is a noop. +func (c *Cgroup) Uninstall() error { + if !c.Own { + // cgroup is managed by caller, don't touch it. + return nil + } + log.Debugf("Deleting cgroup %q", c.Name) + for key := range controllers { + if err := syscall.Rmdir(c.makePath(key)); err != nil && !os.IsNotExist(err) { + return err + } + } + return nil +} + +// Add adds given process to all controllers. +func (c *Cgroup) Add(pid int) error { + for key := range controllers { + if err := setValue(c.makePath(key), "cgroup.procs", strconv.Itoa(pid)); err != nil { + return err + } + } + return nil +} + +// NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'. +func (c *Cgroup) NumCPU() (int, error) { + path := c.makePath("cpuset") + cpuset, err := getValue(path, "cpuset.cpus") + if err != nil { + return 0, err + } + return countCpuset(strings.TrimSpace(cpuset)) +} + +// MemoryLimit returns the memory limit. 
+func (c *Cgroup) MemoryLimit() (uint64, error) { + path := c.makePath("memory") + limStr, err := getValue(path, "memory.limit_in_bytes") + if err != nil { + return 0, err + } + return strconv.ParseUint(strings.TrimSpace(limStr), 10, 64) +} + +func (c *Cgroup) makePath(controllerName string) string { + return filepath.Join(cgroupRoot, controllerName, c.Name) +} + +type controller interface { + set(*specs.LinuxResources, string) error +} + +type noop struct{} + +func (*noop) set(*specs.LinuxResources, string) error { + return nil +} + +type memory struct{} + +func (*memory) set(spec *specs.LinuxResources, path string) error { + if spec.Memory == nil { + return nil + } + if err := setOptionalValueInt(path, "memory.limit_in_bytes", spec.Memory.Limit); err != nil { + return err + } + if err := setOptionalValueInt(path, "memory.soft_limit_in_bytes", spec.Memory.Reservation); err != nil { + return err + } + if err := setOptionalValueInt(path, "memory.memsw.limit_in_bytes", spec.Memory.Swap); err != nil { + return err + } + if err := setOptionalValueInt(path, "memory.kmem.limit_in_bytes", spec.Memory.Kernel); err != nil { + return err + } + if err := setOptionalValueInt(path, "memory.kmem.tcp.limit_in_bytes", spec.Memory.KernelTCP); err != nil { + return err + } + if err := setOptionalValueUint(path, "memory.swappiness", spec.Memory.Swappiness); err != nil { + return err + } + + if spec.Memory.DisableOOMKiller != nil && *spec.Memory.DisableOOMKiller { + if err := setValue(path, "memory.oom_control", "1"); err != nil { + return err + } + } + return nil +} + +type cpu struct{} + +func (*cpu) set(spec *specs.LinuxResources, path string) error { + if spec.CPU == nil { + return nil + } + if err := setOptionalValueUint(path, "cpu.shares", spec.CPU.Shares); err != nil { + return err + } + if err := setOptionalValueInt(path, "cpu.cfs_quota_us", spec.CPU.Quota); err != nil { + return err + } + return setOptionalValueUint(path, "cpu.cfs_period_us", spec.CPU.Period) +} + +type 
cpuSet struct{} + +func (*cpuSet) set(spec *specs.LinuxResources, path string) error { + // cpuset.cpus and mems are required fields, but are not set on a new cgroup. + // If not set in the spec, get it from one of the ancestors cgroup. + if spec.CPU == nil || spec.CPU.Cpus == "" { + if _, err := fillFromAncestor(filepath.Join(path, "cpuset.cpus")); err != nil { + return err + } + } else { + if err := setValue(path, "cpuset.cpus", spec.CPU.Cpus); err != nil { + return err + } + } + + if spec.CPU == nil || spec.CPU.Mems == "" { + _, err := fillFromAncestor(filepath.Join(path, "cpuset.mems")) + return err + } + mems := spec.CPU.Mems + return setValue(path, "cpuset.mems", mems) +} + +type blockIO struct{} + +func (*blockIO) set(spec *specs.LinuxResources, path string) error { + if spec.BlockIO == nil { + return nil + } + + if err := setOptionalValueUint16(path, "blkio.weight", spec.BlockIO.Weight); err != nil { + return err + } + if err := setOptionalValueUint16(path, "blkio.leaf_weight", spec.BlockIO.LeafWeight); err != nil { + return err + } + + for _, dev := range spec.BlockIO.WeightDevice { + val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.Weight) + if err := setValue(path, "blkio.weight_device", val); err != nil { + return err + } + val = fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.LeafWeight) + if err := setValue(path, "blkio.leaf_weight_device", val); err != nil { + return err + } + } + if err := setThrottle(path, "blkio.throttle.read_bps_device", spec.BlockIO.ThrottleReadBpsDevice); err != nil { + return err + } + if err := setThrottle(path, "blkio.throttle.write_bps_device", spec.BlockIO.ThrottleWriteBpsDevice); err != nil { + return err + } + if err := setThrottle(path, "blkio.throttle.read_iops_device", spec.BlockIO.ThrottleReadIOPSDevice); err != nil { + return err + } + return setThrottle(path, "blkio.throttle.write_iops_device", spec.BlockIO.ThrottleWriteIOPSDevice) +} + +func setThrottle(path, name string, devs 
[]specs.LinuxThrottleDevice) error { + for _, dev := range devs { + val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.Rate) + if err := setValue(path, name, val); err != nil { + return err + } + } + return nil +} + +type networkClass struct{} + +func (*networkClass) set(spec *specs.LinuxResources, path string) error { + if spec.Network == nil { + return nil + } + return setOptionalValueUint32(path, "net_cls.classid", spec.Network.ClassID) +} + +type networkPrio struct{} + +func (*networkPrio) set(spec *specs.LinuxResources, path string) error { + if spec.Network == nil { + return nil + } + for _, prio := range spec.Network.Priorities { + val := fmt.Sprintf("%s %d", prio.Name, prio.Priority) + if err := setValue(path, "net_prio.ifpriomap", val); err != nil { + return err + } + } + return nil +} diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go new file mode 100644 index 000000000..cde915329 --- /dev/null +++ b/runsc/cgroup/cgroup_test.go @@ -0,0 +1,56 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cgroup + +import ( + "testing" +) + +func TestCountCpuset(t *testing.T) { + for _, tc := range []struct { + str string + want int + error bool + }{ + {str: "0", want: 1}, + {str: "0,1,2,8,9,10", want: 6}, + {str: "0-1", want: 2}, + {str: "0-7", want: 8}, + {str: "0-7,16,32-39,64,65", want: 19}, + {str: "a", error: true}, + {str: "5-a", error: true}, + {str: "a-5", error: true}, + {str: "-10", error: true}, + {str: "15-", error: true}, + {str: "-", error: true}, + {str: "--", error: true}, + } { + t.Run(tc.str, func(t *testing.T) { + got, err := countCpuset(tc.str) + if tc.error { + if err == nil { + t.Errorf("countCpuset(%q) should have failed", tc.str) + } + } else { + if err != nil { + t.Errorf("countCpuset(%q) failed: %v", tc.str, err) + } + if tc.want != got { + t.Errorf("countCpuset(%q) want: %d, got: %d", tc.str, tc.want, got) + } + } + }) + } +} diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index c6f78f63f..d26e92bcd 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -59,6 +59,13 @@ type Boot struct { // applyCaps determines if capabilities defined in the spec should be applied // to the process. applyCaps bool + + // cpuNum number of CPUs to create inside the sandbox. + cpuNum int + + // totalMem sets the initial amount of total memory to report back to the + // container. + totalMem uint64 } // Name implements subcommands.Command.Name. 
@@ -86,6 +93,8 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order") f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls") f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") + f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox") + f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container") } // Execute implements subcommands.Command.Execute. It starts a sandbox in a @@ -143,7 +152,19 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } // Create the loader. - l, err := boot.New(f.Arg(0), spec, conf, b.controllerFD, b.deviceFD, b.ioFDs.GetArray(), b.stdioFDs.GetArray(), b.console) + bootArgs := boot.Args{ + ID: f.Arg(0), + Spec: spec, + Conf: conf, + ControllerFD: b.controllerFD, + DeviceFD: b.deviceFD, + GoferFDs: b.ioFDs.GetArray(), + StdioFDs: b.stdioFDs.GetArray(), + Console: b.console, + NumCPU: b.cpuNum, + TotalMem: b.totalMem, + } + l, err := boot.New(bootArgs) if err != nil { Fatalf("error creating loader: %v", err) } diff --git a/runsc/container/container.go b/runsc/container/container.go index f0cdee8d3..eaa62daf1 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -262,6 +262,8 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo Status: Creating, Owner: os.Getenv("USER"), } + cu := specutils.MakeCleanup(func() { c.Destroy() }) + defer cu.Clean() // If the metadata annotations indicate that this container should be // started in an existing sandbox, we must do so. The metadata will @@ -276,12 +278,13 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // Start a new sandbox for this container. 
Any errors after this point // must destroy the container. - s, err := sandbox.Create(id, spec, conf, bundleDir, consoleSocket, ioFiles) + c.Sandbox, err = sandbox.Create(id, spec, conf, bundleDir, consoleSocket, ioFiles) if err != nil { - c.Destroy() return nil, err } - c.Sandbox = s + if err := c.Sandbox.AddGoferToCgroup(c.GoferPid); err != nil { + return nil, err + } } else { // This is sort of confusing. For a sandbox with a root // container and a child container in it, runsc sees: @@ -300,7 +303,6 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // Find the sandbox associated with this ID. sb, err := Load(conf.RootDir, sbid) if err != nil { - c.Destroy() return nil, err } c.Sandbox = sb.Sandbox @@ -309,7 +311,6 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // Save the metadata file. if err := c.save(); err != nil { - c.Destroy() return nil, err } @@ -317,11 +318,11 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // this file is created, so it must be the last thing we do. 
if pidFile != "" { if err := ioutil.WriteFile(pidFile, []byte(strconv.Itoa(c.SandboxPid())), 0644); err != nil { - c.Destroy() return nil, fmt.Errorf("error writing PID file: %v", err) } } + cu.Release() return c, nil } @@ -358,6 +359,9 @@ func (c *Container) Start(conf *boot.Config) error { if err := c.Sandbox.Start(c.Spec, conf, c.ID, ioFiles); err != nil { return err } + if err := c.Sandbox.AddGoferToCgroup(c.GoferPid); err != nil { + return err + } } // "If any poststart hook fails, the runtime MUST log a warning, but diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 09965dcc0..eb9c4cd76 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -21,6 +21,7 @@ go_library( "//pkg/sentry/platform/kvm", "//pkg/urpc", "//runsc/boot", + "//runsc/cgroup", "//runsc/console", "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 847417a15..26d725bdd 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -34,6 +34,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm" "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/cgroup" "gvisor.googlesource.com/gvisor/runsc/console" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -58,12 +59,26 @@ type Sandbox struct { // Chroot is the path to the chroot directory that the sandbox process // is running in. Chroot string `json:"chroot"` + + // Cgroup has the cgroup configuration for the sandbox. + Cgroup *cgroup.Cgroup `json:"cgroup"` } // Create creates the sandbox process. The caller must call Destroy() on the // sandbox.
func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string, ioFiles []*os.File) (*Sandbox, error) { s := &Sandbox{ID: id} + c := specutils.MakeCleanup(func() { s.destroy() }) + defer c.Clean() + + if cg, ok := cgroup.New(spec); ok { + s.Cgroup = cg + + // If there is cgroup config, install it before creating sandbox process. + if err := s.Cgroup.Install(spec.Linux.Resources); err != nil { + return nil, fmt.Errorf("error configuring cgroup: %v", err) + } + } // Create the sandbox process. if err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, ioFiles); err != nil { @@ -75,6 +90,13 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo return nil, err } + if s.Cgroup != nil { + if err := s.Cgroup.Add(s.Pid); err != nil { + return nil, fmt.Errorf("error adding sandbox to cgroup: %v", err) + } + } + + c.Release() return s, nil } @@ -483,6 +505,24 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } } + if s.Cgroup != nil { + cpuNum, err := s.Cgroup.NumCPU() + if err != nil { + return fmt.Errorf("error getting cpu count from cgroups: %v", err) + } + cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum)) + + mem, err := s.Cgroup.MemoryLimit() + if err != nil { + return fmt.Errorf("error getting memory limit from cgroups: %v", err) + } + // When memory limit is unset, a "large" number is returned. In that case, + // just stick with the default. + if mem < 0x7ffffffffffff000 { + cmd.Args = append(cmd.Args, "--total-memory", strconv.FormatUint(mem, 10)) + } + } + // Add container as the last argument. 
cmd.Args = append(cmd.Args, s.ID) @@ -590,8 +630,15 @@ func (s *Sandbox) destroy() error { } } + if s.Cgroup != nil { + if err := s.Cgroup.Uninstall(); err != nil { + return err + } + } if s.Chroot != "" { - return tearDownChroot(s.Chroot) + if err := tearDownChroot(s.Chroot); err != nil { + return err + } } return nil @@ -761,6 +808,14 @@ func (s *Sandbox) waitForStopped() error { return backoff.Retry(op, b) } +// AddGoferToCgroup adds the gofer process to the sandbox's cgroup. +func (s *Sandbox) AddGoferToCgroup(pid int) error { + if s.Cgroup != nil { + return s.Cgroup.Add(pid) + } + return nil +} + // deviceFileForPlatform opens the device file for the given platform. If the // platform does not need a device file, then nil is returned. func deviceFileForPlatform(p boot.PlatformType) (*os.File, error) { diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index f1a99ce48..e73b2293f 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -5,7 +5,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "specutils", srcs = [ - "cpu.go", "namespace.go", "specutils.go", ], diff --git a/runsc/specutils/cpu.go b/runsc/specutils/cpu.go deleted file mode 100644 index 9abe26b64..000000000 --- a/runsc/specutils/cpu.go +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package specutils - -import ( - "fmt" - "runtime" - "strconv" - "strings" - - specs "github.com/opencontainers/runtime-spec/specs-go" -) - -// CalculateCPUNumber calculates the number of CPUs that should be exposed -// inside the sandbox. -func CalculateCPUNumber(spec *specs.Spec) (int, error) { - // If spec does not contain CPU field, then return the number of host CPUs. - if spec == nil || spec.Linux == nil || spec.Linux.Resources == nil || spec.Linux.Resources.CPU == nil { - return runtime.NumCPU(), nil - } - cpuSpec := spec.Linux.Resources.CPU - - // If cpuSpec.Cpus is specified, then parse and return that. They must be in - // the list format for cpusets, which is "a comma-separated list of CPU - // numbers and ranges of numbers, in ASCII decimal." --man 7 cpuset. - cpus := cpuSpec.Cpus - if cpus != "" { - cpuNum := 0 - for _, subs := range strings.Split(cpus, ",") { - result, err := parseCPUNumber(subs) - if err != nil { - return 0, err - } - cpuNum += result - } - return cpuNum, nil - } - - // If CPU.Quota and CPU.Period are specified, we can divide them to get an - // approximation of the number of CPUs needed. - if cpuSpec.Quota != nil && cpuSpec.Period != nil && *cpuSpec.Period != 0 { - cpuQuota := *cpuSpec.Quota - cpuPeriod := *cpuSpec.Period - return int(cpuQuota)/int(cpuPeriod) + 1, nil - } - - // Default to number of host cpus. - return runtime.NumCPU(), nil -} - -// parseCPUNumber converts a cpuset string into the number of cpus included in -// the string , e.g. "3-6" -> 4. -func parseCPUNumber(cpus string) (int, error) { - switch cpusSlice := strings.Split(cpus, "-"); len(cpusSlice) { - case 1: - // cpus is not a range. We must only check that it is a valid number. - if _, err := strconv.Atoi(cpus); err != nil { - return 0, fmt.Errorf("invalid individual cpu number %q", cpus) - } - return 1, nil - case 2: - // cpus is a range. We must check that start and end are valid numbers, - // and calculate their difference (inclusively). 
- first, err := strconv.Atoi(cpusSlice[0]) - if err != nil || first < 0 { - return 0, fmt.Errorf("invalid first cpu number %q in range %q", cpusSlice[0], cpus) - } - last, err := strconv.Atoi(cpusSlice[1]) - if err != nil || last < 0 { - return 0, fmt.Errorf("invalid last cpu number %q in range %q", cpusSlice[1], cpus) - } - cpuNum := last - first + 1 - if cpuNum <= 0 { - return 0, fmt.Errorf("cpu range %q does not include positive number of cpus", cpus) - } - } - return 0, fmt.Errorf("invalid cpu string %q", cpus) -} diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index daf10b875..ac017ba2d 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -43,6 +43,13 @@ func LogSpec(spec *specs.Spec) { log.Debugf("Spec: %+v", spec) log.Debugf("Spec.Hooks: %+v", spec.Hooks) log.Debugf("Spec.Linux: %+v", spec.Linux) + if spec.Linux != nil && spec.Linux.Resources != nil { + res := spec.Linux.Resources + log.Debugf("Spec.Linux.Resources.Memory: %+v", res.Memory) + log.Debugf("Spec.Linux.Resources.CPU: %+v", res.CPU) + log.Debugf("Spec.Linux.Resources.BlockIO: %+v", res.BlockIO) + log.Debugf("Spec.Linux.Resources.Network: %+v", res.Network) + } log.Debugf("Spec.Process: %+v", spec.Process) log.Debugf("Spec.Root: %+v", spec.Root) } @@ -402,3 +409,33 @@ func ContainsStr(strs []string, str string) bool { } return false } + +// Cleanup allows defers to be aborted when cleanup needs to happen +// conditionally. Usage: +// c := MakeCleanup(func() { f.Close() }) +// defer c.Clean() // any failure before release is called will close the file. +// ... +// c.Release() // on success, aborts closing the file and return it. +// return f +type Cleanup struct { + clean func() + released bool +} + +// MakeCleanup creates a new Cleanup object. +func MakeCleanup(f func()) Cleanup { + return Cleanup{clean: f} +} + +// Clean calls the cleanup function. 
+func (c *Cleanup) Clean() { + if !c.released { + c.clean() + } +} + +// Release releases the cleanup from its duties, i.e. cleanup function is not +// called after this point. +func (c *Cleanup) Release() { + c.released = true +} diff --git a/runsc/test/integration/BUILD b/runsc/test/integration/BUILD index 4407016ad..726ebf49e 100644 --- a/runsc/test/integration/BUILD +++ b/runsc/test/integration/BUILD @@ -15,9 +15,7 @@ go_test( "manual", "local", ], - deps = [ - "//runsc/test/testutil", - ], + deps = ["//runsc/test/testutil"], ) go_library( diff --git a/runsc/test/integration/integration_test.go b/runsc/test/integration/integration_test.go index 5f24aeed5..5480c5bbe 100644 --- a/runsc/test/integration/integration_test.go +++ b/runsc/test/integration/integration_test.go @@ -26,6 +26,7 @@ import ( "net" "net/http" "os" + "strconv" "strings" "testing" "time" @@ -179,6 +180,89 @@ func TestConnectToSelf(t *testing.T) { } } +func TestMemLimit(t *testing.T) { + if err := testutil.Pull("alpine"); err != nil { + t.Fatal("docker pull failed:", err) + } + d := testutil.MakeDocker("cgroup-test") + cmd := "cat /proc/meminfo | grep MemTotal: | awk '{print $2}'" + out, err := d.RunFg("--memory=500MB", "alpine", "sh", "-c", cmd) + if err != nil { + t.Fatal("docker run failed:", err) + } + defer d.CleanUp() + + // Remove warning message that swap isn't present. 
+ if strings.HasPrefix(out, "WARNING") { + lines := strings.Split(out, "\n") + if len(lines) != 3 { + t.Fatalf("invalid output: %s", out) + } + out = lines[1] + } + + got, err := strconv.ParseUint(strings.TrimSpace(out), 10, 64) + if err != nil { + t.Fatalf("failed to parse %q: %v", out, err) + } + if want := uint64(500 * 1024); got != want { + t.Errorf("MemTotal got: %d, want: %d", got, want) + } +} + +func TestNumCPU(t *testing.T) { + if err := testutil.Pull("alpine"); err != nil { + t.Fatal("docker pull failed:", err) + } + d := testutil.MakeDocker("cgroup-test") + cmd := "cat /proc/cpuinfo | grep 'processor.*:' | wc -l" + out, err := d.RunFg("--cpuset-cpus=0", "alpine", "sh", "-c", cmd) + if err != nil { + t.Fatal("docker run failed:", err) + } + defer d.CleanUp() + + got, err := strconv.Atoi(strings.TrimSpace(out)) + if err != nil { + t.Fatalf("failed to parse %q: %v", out, err) + } + if want := 1; got != want { + t.Errorf("MemTotal got: %d, want: %d", got, want) + } +} + +// TestCgroup sets cgroup options and checks that container can start. +// TODO: Verify that these were set to cgroup on the host. 
+func TestCgroup(t *testing.T) { + if err := testutil.Pull("alpine"); err != nil { + t.Fatal("docker pull failed:", err) + } + d := testutil.MakeDocker("cgroup-test") + + var args []string + args = append(args, "--cpu-shares=1000") + args = append(args, "--cpu-period=2000") + args = append(args, "--cpu-quota=3000") + args = append(args, "--cpuset-cpus=0") + args = append(args, "--cpuset-mems=0") + args = append(args, "--kernel-memory=100MB") + args = append(args, "--memory=1GB") + args = append(args, "--memory-reservation=500MB") + args = append(args, "--memory-swap=2GB") + args = append(args, "--memory-swappiness=5") + args = append(args, "--blkio-weight=750") + + args = append(args, "hello-world") + if err := d.Run(args...); err != nil { + t.Fatal("docker create failed:", err) + } + defer d.CleanUp() + + if _, err := d.WaitForOutput("Hello from Docker!", 5*time.Second); err != nil { + t.Fatalf("docker didn't say hello: %v", err) + } +} + func TestMain(m *testing.M) { testutil.EnsureSupportedDockerVersion() os.Exit(m.Run()) diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index d70b4377a..2f15ab818 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -198,6 +198,15 @@ func (d *Docker) Run(args ...string) error { return err } +// RunFg calls 'docker run' with the arguments provided in the foreground. It +// blocks until the container exits and returns the output. +func (d *Docker) RunFg(args ...string) (string, error) { + a := []string{"run", "--runtime", d.Runtime, "--name", d.Name} + a = append(a, args...) + out, err := do(a...) + return string(out), err +} + // Logs calls 'docker logs'. func (d *Docker) Logs() (string, error) { return do("logs", d.Name) |