diff options
Diffstat (limited to 'runsc/specutils')
-rw-r--r-- | runsc/specutils/fs.go | 137 | ||||
-rw-r--r-- | runsc/specutils/namespace.go | 222 | ||||
-rw-r--r-- | runsc/specutils/specutils.go | 494 |
3 files changed, 853 insertions, 0 deletions
diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go new file mode 100644 index 000000000..1f3afb4e4 --- /dev/null +++ b/runsc/specutils/fs.go @@ -0,0 +1,137 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package specutils + +import ( + "fmt" + "path" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +type mapping struct { + set bool + val uint32 +} + +// optionsMap maps mount propagation-related OCI filesystem options to mount(2) +// syscall flags. +var optionsMap = map[string]mapping{ + "acl": {set: true, val: syscall.MS_POSIXACL}, + "async": {set: false, val: syscall.MS_SYNCHRONOUS}, + "atime": {set: false, val: syscall.MS_NOATIME}, + "bind": {set: true, val: syscall.MS_BIND}, + "defaults": {set: true, val: 0}, + "dev": {set: false, val: syscall.MS_NODEV}, + "diratime": {set: false, val: syscall.MS_NODIRATIME}, + "dirsync": {set: true, val: syscall.MS_DIRSYNC}, + "exec": {set: false, val: syscall.MS_NOEXEC}, + "noexec": {set: true, val: syscall.MS_NOEXEC}, + "iversion": {set: true, val: syscall.MS_I_VERSION}, + "loud": {set: false, val: syscall.MS_SILENT}, + "mand": {set: true, val: syscall.MS_MANDLOCK}, + "noacl": {set: false, val: syscall.MS_POSIXACL}, + "noatime": {set: true, val: syscall.MS_NOATIME}, + "nodev": {set: true, val: syscall.MS_NODEV}, + "nodiratime": {set: true, val: syscall.MS_NODIRATIME}, + "noiversion": {set: false, val: syscall.MS_I_VERSION}, + "nomand": {set: false, val: syscall.MS_MANDLOCK}, + "norelatime": {set: false, val: syscall.MS_RELATIME}, + "nostrictatime": {set: false, val: syscall.MS_STRICTATIME}, + "nosuid": {set: true, val: syscall.MS_NOSUID}, + "rbind": {set: true, val: syscall.MS_BIND | syscall.MS_REC}, + "relatime": {set: true, val: syscall.MS_RELATIME}, + "remount": {set: true, val: syscall.MS_REMOUNT}, + "ro": {set: true, val: syscall.MS_RDONLY}, + "rw": {set: false, val: syscall.MS_RDONLY}, + "silent": {set: true, val: syscall.MS_SILENT}, + "strictatime": {set: true, val: syscall.MS_STRICTATIME}, + "suid": {set: false, val: syscall.MS_NOSUID}, + "sync": {set: true, val: syscall.MS_SYNCHRONOUS}, +} + +// propOptionsMap is similar to optionsMap, but it lists propagation options +// that cannot be used together with other flags. +var propOptionsMap = map[string]mapping{ + "private": {set: true, val: syscall.MS_PRIVATE}, + "rprivate": {set: true, val: syscall.MS_PRIVATE | syscall.MS_REC}, + "slave": {set: true, val: syscall.MS_SLAVE}, + "rslave": {set: true, val: syscall.MS_SLAVE | syscall.MS_REC}, + "unbindable": {set: true, val: syscall.MS_UNBINDABLE}, + "runbindable": {set: true, val: syscall.MS_UNBINDABLE | syscall.MS_REC}, +} + +// invalidOptions list options not allowed. +// - shared: sandbox must be isolated from the host. Propagating mount changes +// from the sandbox to the host breaks the isolation. +var invalidOptions = []string{"shared", "rshared"} + +// OptionsToFlags converts mount options to syscall flags. +func OptionsToFlags(opts []string) uint32 { + return optionsToFlags(opts, optionsMap) +} + +// PropOptionsToFlags converts propagation mount options to syscall flags. +// Propagation options cannot be set other with other options and must be +// handled separatedly. +func PropOptionsToFlags(opts []string) uint32 { + return optionsToFlags(opts, propOptionsMap) +} + +func optionsToFlags(opts []string, source map[string]mapping) uint32 { + var rv uint32 + for _, opt := range opts { + if m, ok := source[opt]; ok { + if m.set { + rv |= m.val + } else { + rv ^= m.val + } + } + } + return rv +} + +// ValidateMount validates that spec mounts are correct. +func validateMount(mnt *specs.Mount) error { + if !path.IsAbs(mnt.Destination) { + return fmt.Errorf("Mount.Destination must be an absolute path: %v", mnt) + } + + if mnt.Type == "bind" { + for _, o := range mnt.Options { + if ContainsStr(invalidOptions, o) { + return fmt.Errorf("mount option %q is not supported: %v", o, mnt) + } + _, ok1 := optionsMap[o] + _, ok2 := propOptionsMap[o] + if !ok1 && !ok2 { + return fmt.Errorf("unknown mount option %q", o) + } + } + } + return nil +} + +// ValidateRootfsPropagation validates that rootfs propagation options are +// correct. +func validateRootfsPropagation(opt string) error { + flags := PropOptionsToFlags([]string{opt}) + if flags&(syscall.MS_SLAVE|syscall.MS_PRIVATE) == 0 { + return fmt.Errorf("root mount propagation option must specify private or slave: %q", opt) + } + return nil +} diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go new file mode 100644 index 000000000..7d194335c --- /dev/null +++ b/runsc/specutils/namespace.go @@ -0,0 +1,222 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package specutils + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/syndtr/gocapability/capability" + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/log" +) + +// nsCloneFlag returns the clone flag that can be used to set a namespace of +// the given type. +func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr { + switch nst { + case specs.IPCNamespace: + return syscall.CLONE_NEWIPC + case specs.MountNamespace: + return syscall.CLONE_NEWNS + case specs.NetworkNamespace: + return syscall.CLONE_NEWNET + case specs.PIDNamespace: + return syscall.CLONE_NEWPID + case specs.UTSNamespace: + return syscall.CLONE_NEWUTS + case specs.UserNamespace: + return syscall.CLONE_NEWUSER + case specs.CgroupNamespace: + panic("cgroup namespace has no associated clone flag") + default: + panic(fmt.Sprintf("unknown namespace %v", nst)) + } +} + +// nsPath returns the path of the namespace for the current process and the +// given namespace. +func nsPath(nst specs.LinuxNamespaceType) string { + base := "/proc/self/ns" + switch nst { + case specs.CgroupNamespace: + return filepath.Join(base, "cgroup") + case specs.IPCNamespace: + return filepath.Join(base, "ipc") + case specs.MountNamespace: + return filepath.Join(base, "mnt") + case specs.NetworkNamespace: + return filepath.Join(base, "net") + case specs.PIDNamespace: + return filepath.Join(base, "pid") + case specs.UserNamespace: + return filepath.Join(base, "user") + case specs.UTSNamespace: + return filepath.Join(base, "uts") + default: + panic(fmt.Sprintf("unknown namespace %v", nst)) + } +} + +// GetNS returns true and the namespace with the given type from the slice of +// namespaces in the spec. It returns false if the slice does not contain a +// namespace with the type. +func GetNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) { + if s.Linux == nil { + return specs.LinuxNamespace{}, false + } + for _, ns := range s.Linux.Namespaces { + if ns.Type == nst { + return ns, true + } + } + return specs.LinuxNamespace{}, false +} + +// FilterNS returns a slice of namespaces from the spec with types that match +// those in the `filter` slice. +func FilterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNamespace { + if s.Linux == nil { + return nil + } + var out []specs.LinuxNamespace + for _, nst := range filter { + if ns, ok := GetNS(nst, s); ok { + out = append(out, ns) + } + } + return out +} + +// setNS sets the namespace of the given type. It must be called with +// OSThreadLocked. +func setNS(fd, nsType uintptr) error { + if _, _, err := syscall.RawSyscall(unix.SYS_SETNS, fd, nsType, 0); err != 0 { + return err + } + return nil +} + +// ApplyNS applies the namespace on the current thread and returns a function +// that will restore the namespace to the original value. +// +// Preconditions: Must be called with os thread locked. +func ApplyNS(ns specs.LinuxNamespace) (func(), error) { + log.Infof("Applying namespace %v at path %q", ns.Type, ns.Path) + newNS, err := os.Open(ns.Path) + if err != nil { + return nil, fmt.Errorf("error opening %q: %v", ns.Path, err) + } + defer newNS.Close() + + // Store current namespace to restore back. + curPath := nsPath(ns.Type) + oldNS, err := os.Open(curPath) + if err != nil { + return nil, fmt.Errorf("error opening %q: %v", curPath, err) + } + + // Set namespace to the one requested and setup function to restore it back. + flag := nsCloneFlag(ns.Type) + if err := setNS(newNS.Fd(), flag); err != nil { + oldNS.Close() + return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err) + } + return func() { + log.Infof("Restoring namespace %v", ns.Type) + defer oldNS.Close() + if err := setNS(oldNS.Fd(), flag); err != nil { + panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err)) + } + }, nil +} + +// StartInNS joins or creates the given namespaces and calls cmd.Start before +// restoring the namespaces to the original values. +func StartInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error { + // We are about to setup namespaces, which requires the os thread being + // locked so that Go doesn't change the thread out from under us. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + + for _, ns := range nss { + if ns.Path == "" { + // No path. Just set a flag to create a new namespace. + cmd.SysProcAttr.Cloneflags |= nsCloneFlag(ns.Type) + continue + } + // Join the given namespace, and restore the current namespace + // before exiting. + restoreNS, err := ApplyNS(ns) + if err != nil { + return err + } + defer restoreNS() + } + + return cmd.Start() +} + +// SetUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd. +func SetUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) { + if s.Linux == nil { + return + } + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + for _, idMap := range s.Linux.UIDMappings { + log.Infof("Mapping host uid %d to container uid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size) + cmd.SysProcAttr.UidMappings = append(cmd.SysProcAttr.UidMappings, syscall.SysProcIDMap{ + ContainerID: int(idMap.ContainerID), + HostID: int(idMap.HostID), + Size: int(idMap.Size), + }) + } + for _, idMap := range s.Linux.GIDMappings { + log.Infof("Mapping host gid %d to container gid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size) + cmd.SysProcAttr.GidMappings = append(cmd.SysProcAttr.GidMappings, syscall.SysProcIDMap{ + ContainerID: int(idMap.ContainerID), + HostID: int(idMap.HostID), + Size: int(idMap.Size), + }) + } +} + +// HasCapabilities returns true if the user has all capabilties in 'cs'. +func HasCapabilities(cs ...capability.Cap) bool { + caps, err := capability.NewPid2(os.Getpid()) + if err != nil { + return false + } + if err := caps.Load(); err != nil { + return false + } + for _, c := range cs { + if !caps.Get(capability.EFFECTIVE, c) { + return false + } + } + return true +} diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go new file mode 100644 index 000000000..2888f55db --- /dev/null +++ b/runsc/specutils/specutils.go @@ -0,0 +1,494 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package specutils contains utility functions for working with OCI runtime +// specs. +package specutils + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "path" + "path/filepath" + "strings" + "syscall" + "time" + + "github.com/cenkalti/backoff" + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" +) + +// ExePath must point to runsc binary, which is normally the same binary. It's +// changed in tests that aren't linked in the same binary. +var ExePath = "/proc/self/exe" + +// Version is the supported spec version. +var Version = specs.Version + +// LogSpec logs the spec in a human-friendly way. +func LogSpec(spec *specs.Spec) { + log.Debugf("Spec: %+v", spec) + log.Debugf("Spec.Hooks: %+v", spec.Hooks) + log.Debugf("Spec.Linux: %+v", spec.Linux) + if spec.Linux != nil && spec.Linux.Resources != nil { + res := spec.Linux.Resources + log.Debugf("Spec.Linux.Resources.Memory: %+v", res.Memory) + log.Debugf("Spec.Linux.Resources.CPU: %+v", res.CPU) + log.Debugf("Spec.Linux.Resources.BlockIO: %+v", res.BlockIO) + log.Debugf("Spec.Linux.Resources.Network: %+v", res.Network) + } + log.Debugf("Spec.Process: %+v", spec.Process) + log.Debugf("Spec.Root: %+v", spec.Root) + log.Debugf("Spec.Mounts: %+v", spec.Mounts) +} + +// ValidateSpec validates that the spec is compatible with runsc. +func ValidateSpec(spec *specs.Spec) error { + // Mandatory fields. + if spec.Process == nil { + return fmt.Errorf("Spec.Process must be defined: %+v", spec) + } + if len(spec.Process.Args) == 0 { + return fmt.Errorf("Spec.Process.Arg must be defined: %+v", spec.Process) + } + if spec.Root == nil { + return fmt.Errorf("Spec.Root must be defined: %+v", spec) + } + if len(spec.Root.Path) == 0 { + return fmt.Errorf("Spec.Root.Path must be defined: %+v", spec.Root) + } + + // Unsupported fields. + if spec.Solaris != nil { + return fmt.Errorf("Spec.Solaris is not supported: %+v", spec) + } + if spec.Windows != nil { + return fmt.Errorf("Spec.Windows is not supported: %+v", spec) + } + if len(spec.Process.SelinuxLabel) != 0 { + return fmt.Errorf("SELinux is not supported: %s", spec.Process.SelinuxLabel) + } + + // Docker uses AppArmor by default, so just log that it's being ignored. + if spec.Process.ApparmorProfile != "" { + log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile) + } + + // TODO(b/72226747): Apply seccomp to application inside sandbox. + if spec.Linux != nil && spec.Linux.Seccomp != nil { + log.Warningf("Seccomp spec is being ignored") + } + + if spec.Linux != nil && spec.Linux.RootfsPropagation != "" { + if err := validateRootfsPropagation(spec.Linux.RootfsPropagation); err != nil { + return err + } + } + for _, m := range spec.Mounts { + if err := validateMount(&m); err != nil { + return err + } + } + + // Two annotations are use by containerd to support multi-container pods. + // "io.kubernetes.cri.container-type" + // "io.kubernetes.cri.sandbox-id" + containerType, hasContainerType := spec.Annotations[ContainerdContainerTypeAnnotation] + _, hasSandboxID := spec.Annotations[ContainerdSandboxIDAnnotation] + switch { + // Non-containerd use won't set a container type. + case !hasContainerType: + case containerType == ContainerdContainerTypeSandbox: + // When starting a container in an existing sandbox, the sandbox ID + // must be set. + case containerType == ContainerdContainerTypeContainer: + if !hasSandboxID { + return fmt.Errorf("spec has container-type of %s, but no sandbox ID set", containerType) + } + default: + return fmt.Errorf("unknown container-type: %s", containerType) + } + + return nil +} + +// absPath turns the given path into an absolute path (if it is not already +// absolute) by prepending the base path. +func absPath(base, rel string) string { + if filepath.IsAbs(rel) { + return rel + } + return filepath.Join(base, rel) +} + +// OpenSpec opens an OCI runtime spec from the given bundle directory. +func OpenSpec(bundleDir string) (*os.File, error) { + // The spec file must be named "config.json" inside the bundle directory. + return os.Open(filepath.Join(bundleDir, "config.json")) +} + +// ReadSpec reads an OCI runtime spec from the given bundle directory. +// ReadSpec also normalizes all potential relative paths into absolute +// path, e.g. spec.Root.Path, mount.Source. +func ReadSpec(bundleDir string) (*specs.Spec, error) { + specFile, err := OpenSpec(bundleDir) + if err != nil { + return nil, fmt.Errorf("error opening spec file %q: %v", filepath.Join(bundleDir, "config.json"), err) + } + defer specFile.Close() + return ReadSpecFromFile(bundleDir, specFile) +} + +// ReadSpecFromFile reads an OCI runtime spec from the given File, and +// normalizes all relative paths into absolute by prepending the bundle dir. +func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error) { + if _, err := specFile.Seek(0, os.SEEK_SET); err != nil { + return nil, fmt.Errorf("error seeking to beginning of file %q: %v", specFile.Name(), err) + } + specBytes, err := ioutil.ReadAll(specFile) + if err != nil { + return nil, fmt.Errorf("error reading spec from file %q: %v", specFile.Name(), err) + } + var spec specs.Spec + if err := json.Unmarshal(specBytes, &spec); err != nil { + return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile.Name(), err, string(specBytes)) + } + if err := ValidateSpec(&spec); err != nil { + return nil, err + } + // Turn any relative paths in the spec to absolute by prepending the bundleDir. + spec.Root.Path = absPath(bundleDir, spec.Root.Path) + for i := range spec.Mounts { + m := &spec.Mounts[i] + if m.Source != "" { + m.Source = absPath(bundleDir, m.Source) + } + } + return &spec, nil +} + +// ReadMounts reads mount list from a file. +func ReadMounts(f *os.File) ([]specs.Mount, error) { + bytes, err := ioutil.ReadAll(f) + if err != nil { + return nil, fmt.Errorf("error reading mounts: %v", err) + } + var mounts []specs.Mount + if err := json.Unmarshal(bytes, &mounts); err != nil { + return nil, fmt.Errorf("error unmarshaling mounts: %v\n %s", err, string(bytes)) + } + return mounts, nil +} + +// Capabilities takes in spec and returns a TaskCapabilities corresponding to +// the spec. +func Capabilities(enableRaw bool, specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) { + // Strip CAP_NET_RAW from all capability sets if necessary. + skipSet := map[linux.Capability]struct{}{} + if !enableRaw { + skipSet[linux.CAP_NET_RAW] = struct{}{} + } + + var caps auth.TaskCapabilities + if specCaps != nil { + var err error + if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding, skipSet); err != nil { + return nil, err + } + if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective, skipSet); err != nil { + return nil, err + } + if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable, skipSet); err != nil { + return nil, err + } + if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted, skipSet); err != nil { + return nil, err + } + // TODO(nlacasse): Support ambient capabilities. + } + return &caps, nil +} + +// AllCapabilities returns a LinuxCapabilities struct with all capabilities. +func AllCapabilities() *specs.LinuxCapabilities { + var names []string + for n := range capFromName { + names = append(names, n) + } + return &specs.LinuxCapabilities{ + Bounding: names, + Effective: names, + Inheritable: names, + Permitted: names, + Ambient: names, + } +} + +var capFromName = map[string]linux.Capability{ + "CAP_CHOWN": linux.CAP_CHOWN, + "CAP_DAC_OVERRIDE": linux.CAP_DAC_OVERRIDE, + "CAP_DAC_READ_SEARCH": linux.CAP_DAC_READ_SEARCH, + "CAP_FOWNER": linux.CAP_FOWNER, + "CAP_FSETID": linux.CAP_FSETID, + "CAP_KILL": linux.CAP_KILL, + "CAP_SETGID": linux.CAP_SETGID, + "CAP_SETUID": linux.CAP_SETUID, + "CAP_SETPCAP": linux.CAP_SETPCAP, + "CAP_LINUX_IMMUTABLE": linux.CAP_LINUX_IMMUTABLE, + "CAP_NET_BIND_SERVICE": linux.CAP_NET_BIND_SERVICE, + "CAP_NET_BROADCAST": linux.CAP_NET_BROADCAST, + "CAP_NET_ADMIN": linux.CAP_NET_ADMIN, + "CAP_NET_RAW": linux.CAP_NET_RAW, + "CAP_IPC_LOCK": linux.CAP_IPC_LOCK, + "CAP_IPC_OWNER": linux.CAP_IPC_OWNER, + "CAP_SYS_MODULE": linux.CAP_SYS_MODULE, + "CAP_SYS_RAWIO": linux.CAP_SYS_RAWIO, + "CAP_SYS_CHROOT": linux.CAP_SYS_CHROOT, + "CAP_SYS_PTRACE": linux.CAP_SYS_PTRACE, + "CAP_SYS_PACCT": linux.CAP_SYS_PACCT, + "CAP_SYS_ADMIN": linux.CAP_SYS_ADMIN, + "CAP_SYS_BOOT": linux.CAP_SYS_BOOT, + "CAP_SYS_NICE": linux.CAP_SYS_NICE, + "CAP_SYS_RESOURCE": linux.CAP_SYS_RESOURCE, + "CAP_SYS_TIME": linux.CAP_SYS_TIME, + "CAP_SYS_TTY_CONFIG": linux.CAP_SYS_TTY_CONFIG, + "CAP_MKNOD": linux.CAP_MKNOD, + "CAP_LEASE": linux.CAP_LEASE, + "CAP_AUDIT_WRITE": linux.CAP_AUDIT_WRITE, + "CAP_AUDIT_CONTROL": linux.CAP_AUDIT_CONTROL, + "CAP_SETFCAP": linux.CAP_SETFCAP, + "CAP_MAC_OVERRIDE": linux.CAP_MAC_OVERRIDE, + "CAP_MAC_ADMIN": linux.CAP_MAC_ADMIN, + "CAP_SYSLOG": linux.CAP_SYSLOG, + "CAP_WAKE_ALARM": linux.CAP_WAKE_ALARM, + "CAP_BLOCK_SUSPEND": linux.CAP_BLOCK_SUSPEND, + "CAP_AUDIT_READ": linux.CAP_AUDIT_READ, +} + +func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth.CapabilitySet, error) { + var caps []linux.Capability + for _, n := range names { + c, ok := capFromName[n] + if !ok { + return 0, fmt.Errorf("unknown capability %q", n) + } + // Should we skip this capabilty? + if _, ok := skipSet[c]; ok { + continue + } + caps = append(caps, c) + } + return auth.CapabilitySetOfMany(caps), nil +} + +// Is9PMount returns true if the given mount can be mounted as an external gofer. +func Is9PMount(m specs.Mount) bool { + return m.Type == "bind" && m.Source != "" && IsSupportedDevMount(m) +} + +// IsSupportedDevMount returns true if the mount is a supported /dev mount. +// Only mount that does not conflict with runsc default /dev mount is +// supported. +func IsSupportedDevMount(m specs.Mount) bool { + // These are devices exist inside sentry. See pkg/sentry/fs/dev/dev.go + var existingDevices = []string{ + "/dev/fd", "/dev/stdin", "/dev/stdout", "/dev/stderr", + "/dev/null", "/dev/zero", "/dev/full", "/dev/random", + "/dev/urandom", "/dev/shm", "/dev/pts", "/dev/ptmx", + } + dst := filepath.Clean(m.Destination) + if dst == "/dev" { + // OCI spec uses many different mounts for the things inside of '/dev'. We + // have a single mount at '/dev' that is always mounted, regardless of + // whether it was asked for, as the spec says we SHOULD. + return false + } + for _, dev := range existingDevices { + if dst == dev || strings.HasPrefix(dst, dev+"/") { + return false + } + } + return true +} + +const ( + // ContainerdContainerTypeAnnotation is the OCI annotation set by + // containerd to indicate whether the container to create should have + // its own sandbox or a container within an existing sandbox. + ContainerdContainerTypeAnnotation = "io.kubernetes.cri.container-type" + // ContainerdContainerTypeContainer is the container type value + // indicating the container should be created in an existing sandbox. + ContainerdContainerTypeContainer = "container" + // ContainerdContainerTypeSandbox is the container type value + // indicating the container should be created in a new sandbox. + ContainerdContainerTypeSandbox = "sandbox" + + // ContainerdSandboxIDAnnotation is the OCI annotation set to indicate + // which sandbox the container should be created in when the container + // is not the first container in the sandbox. + ContainerdSandboxIDAnnotation = "io.kubernetes.cri.sandbox-id" +) + +// ShouldCreateSandbox returns true if the spec indicates that a new sandbox +// should be created for the container. If false, the container should be +// started in an existing sandbox. +func ShouldCreateSandbox(spec *specs.Spec) bool { + t, ok := spec.Annotations[ContainerdContainerTypeAnnotation] + return !ok || t == ContainerdContainerTypeSandbox +} + +// SandboxID returns the ID of the sandbox to join and whether an ID was found +// in the spec. +func SandboxID(spec *specs.Spec) (string, bool) { + id, ok := spec.Annotations[ContainerdSandboxIDAnnotation] + return id, ok +} + +// WaitForReady waits for a process to become ready. The process is ready when +// the 'ready' function returns true. It continues to wait if 'ready' returns +// false. It returns error on timeout, if the process stops or if 'ready' fails. +func WaitForReady(pid int, timeout time.Duration, ready func() (bool, error)) error { + b := backoff.NewExponentialBackOff() + b.InitialInterval = 1 * time.Millisecond + b.MaxInterval = 1 * time.Second + b.MaxElapsedTime = timeout + + op := func() error { + if ok, err := ready(); err != nil { + return backoff.Permanent(err) + } else if ok { + return nil + } + + // Check if the process is still running. + // If the process is alive, child is 0 because of the NOHANG option. + // If the process has terminated, child equals the process id. + var ws syscall.WaitStatus + var ru syscall.Rusage + child, err := syscall.Wait4(pid, &ws, syscall.WNOHANG, &ru) + if err != nil { + return backoff.Permanent(fmt.Errorf("error waiting for process: %v", err)) + } else if child == pid { + return backoff.Permanent(fmt.Errorf("process %d has terminated", pid)) + } + return fmt.Errorf("process %d not running yet", pid) + } + return backoff.Retry(op, b) +} + +// DebugLogFile opens a log file using 'logPattern' as location. If 'logPattern' +// ends with '/', it's used as a directory with default file name. +// 'logPattern' can contain variables that are substitued: +// - %TIMESTAMP%: is replaced with a timestamp using the following format: +// <yyyymmdd-hhmmss.uuuuuu> +// - %COMMAND%: is replaced with 'command' +func DebugLogFile(logPattern, command string) (*os.File, error) { + if strings.HasSuffix(logPattern, "/") { + // Default format: <debug-log>/runsc.log.<yyyymmdd-hhmmss.uuuuuu>.<command> + logPattern += "runsc.log.%TIMESTAMP%.%COMMAND%" + } + logPattern = strings.Replace(logPattern, "%TIMESTAMP%", time.Now().Format("20060102-150405.000000"), -1) + logPattern = strings.Replace(logPattern, "%COMMAND%", command, -1) + + dir := filepath.Dir(logPattern) + if err := os.MkdirAll(dir, 0775); err != nil { + return nil, fmt.Errorf("error creating dir %q: %v", dir, err) + } + return os.OpenFile(logPattern, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664) +} + +// Mount creates the mount point and calls Mount with the given flags. +func Mount(src, dst, typ string, flags uint32) error { + // Create the mount point inside. The type must be the same as the + // source (file or directory). + var isDir bool + if typ == "proc" { + // Special case, as there is no source directory for proc mounts. + isDir = true + } else if fi, err := os.Stat(src); err != nil { + return fmt.Errorf("Stat(%q) failed: %v", src, err) + } else { + isDir = fi.IsDir() + } + + if isDir { + // Create the destination directory. + if err := os.MkdirAll(dst, 0777); err != nil { + return fmt.Errorf("Mkdir(%q) failed: %v", dst, err) + } + } else { + // Create the parent destination directory. + parent := path.Dir(dst) + if err := os.MkdirAll(parent, 0777); err != nil { + return fmt.Errorf("Mkdir(%q) failed: %v", parent, err) + } + // Create the destination file if it does not exist. + f, err := os.OpenFile(dst, syscall.O_CREAT, 0777) + if err != nil { + return fmt.Errorf("Open(%q) failed: %v", dst, err) + } + f.Close() + } + + // Do the mount. + if err := syscall.Mount(src, dst, typ, uintptr(flags), ""); err != nil { + return fmt.Errorf("Mount(%q, %q, %d) failed: %v", src, dst, flags, err) + } + return nil +} + +// ContainsStr returns true if 'str' is inside 'strs'. +func ContainsStr(strs []string, str string) bool { + for _, s := range strs { + if s == str { + return true + } + } + return false +} + +// Cleanup allows defers to be aborted when cleanup needs to happen +// conditionally. Usage: +// c := MakeCleanup(func() { f.Close() }) +// defer c.Clean() // any failure before release is called will close the file. +// ... +// c.Release() // on success, aborts closing the file and return it. +// return f +type Cleanup struct { + clean func() +} + +// MakeCleanup creates a new Cleanup object. +func MakeCleanup(f func()) Cleanup { + return Cleanup{clean: f} +} + +// Clean calls the cleanup function. +func (c *Cleanup) Clean() { + if c.clean != nil { + c.clean() + c.clean = nil + } +} + +// Release releases the cleanup from its duties, i.e. cleanup function is not +// called after this point. +func (c *Cleanup) Release() { + c.clean = nil +} |