summaryrefslogtreecommitdiffhomepage
path: root/runsc/specutils
diff options
context:
space:
mode:
Diffstat (limited to 'runsc/specutils')
-rw-r--r--runsc/specutils/fs.go137
-rw-r--r--runsc/specutils/namespace.go222
-rw-r--r--runsc/specutils/specutils.go494
3 files changed, 853 insertions, 0 deletions
diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go
new file mode 100644
index 000000000..1f3afb4e4
--- /dev/null
+++ b/runsc/specutils/fs.go
@@ -0,0 +1,137 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package specutils
+
+import (
+ "fmt"
+ "path"
+ "syscall"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+type mapping struct {
+ set bool
+ val uint32
+}
+
+// optionsMap maps mount propagation-related OCI filesystem options to mount(2)
+// syscall flags.
+var optionsMap = map[string]mapping{
+ "acl": {set: true, val: syscall.MS_POSIXACL},
+ "async": {set: false, val: syscall.MS_SYNCHRONOUS},
+ "atime": {set: false, val: syscall.MS_NOATIME},
+ "bind": {set: true, val: syscall.MS_BIND},
+ "defaults": {set: true, val: 0},
+ "dev": {set: false, val: syscall.MS_NODEV},
+ "diratime": {set: false, val: syscall.MS_NODIRATIME},
+ "dirsync": {set: true, val: syscall.MS_DIRSYNC},
+ "exec": {set: false, val: syscall.MS_NOEXEC},
+ "noexec": {set: true, val: syscall.MS_NOEXEC},
+ "iversion": {set: true, val: syscall.MS_I_VERSION},
+ "loud": {set: false, val: syscall.MS_SILENT},
+ "mand": {set: true, val: syscall.MS_MANDLOCK},
+ "noacl": {set: false, val: syscall.MS_POSIXACL},
+ "noatime": {set: true, val: syscall.MS_NOATIME},
+ "nodev": {set: true, val: syscall.MS_NODEV},
+ "nodiratime": {set: true, val: syscall.MS_NODIRATIME},
+ "noiversion": {set: false, val: syscall.MS_I_VERSION},
+ "nomand": {set: false, val: syscall.MS_MANDLOCK},
+ "norelatime": {set: false, val: syscall.MS_RELATIME},
+ "nostrictatime": {set: false, val: syscall.MS_STRICTATIME},
+ "nosuid": {set: true, val: syscall.MS_NOSUID},
+ "rbind": {set: true, val: syscall.MS_BIND | syscall.MS_REC},
+ "relatime": {set: true, val: syscall.MS_RELATIME},
+ "remount": {set: true, val: syscall.MS_REMOUNT},
+ "ro": {set: true, val: syscall.MS_RDONLY},
+ "rw": {set: false, val: syscall.MS_RDONLY},
+ "silent": {set: true, val: syscall.MS_SILENT},
+ "strictatime": {set: true, val: syscall.MS_STRICTATIME},
+ "suid": {set: false, val: syscall.MS_NOSUID},
+ "sync": {set: true, val: syscall.MS_SYNCHRONOUS},
+}
+
+// propOptionsMap is similar to optionsMap, but it lists propagation options
+// that cannot be used together with other flags.
+var propOptionsMap = map[string]mapping{
+ "private": {set: true, val: syscall.MS_PRIVATE},
+ "rprivate": {set: true, val: syscall.MS_PRIVATE | syscall.MS_REC},
+ "slave": {set: true, val: syscall.MS_SLAVE},
+ "rslave": {set: true, val: syscall.MS_SLAVE | syscall.MS_REC},
+ "unbindable": {set: true, val: syscall.MS_UNBINDABLE},
+ "runbindable": {set: true, val: syscall.MS_UNBINDABLE | syscall.MS_REC},
+}
+
+// invalidOptions list options not allowed.
+// - shared: sandbox must be isolated from the host. Propagating mount changes
+// from the sandbox to the host breaks the isolation.
+var invalidOptions = []string{"shared", "rshared"}
+
+// OptionsToFlags converts mount options to syscall flags.
+func OptionsToFlags(opts []string) uint32 {
+ return optionsToFlags(opts, optionsMap)
+}
+
+// PropOptionsToFlags converts propagation mount options to syscall flags.
+// Propagation options cannot be set other with other options and must be
+// handled separatedly.
+func PropOptionsToFlags(opts []string) uint32 {
+ return optionsToFlags(opts, propOptionsMap)
+}
+
+func optionsToFlags(opts []string, source map[string]mapping) uint32 {
+ var rv uint32
+ for _, opt := range opts {
+ if m, ok := source[opt]; ok {
+ if m.set {
+ rv |= m.val
+ } else {
+ rv ^= m.val
+ }
+ }
+ }
+ return rv
+}
+
+// ValidateMount validates that spec mounts are correct.
+func validateMount(mnt *specs.Mount) error {
+ if !path.IsAbs(mnt.Destination) {
+ return fmt.Errorf("Mount.Destination must be an absolute path: %v", mnt)
+ }
+
+ if mnt.Type == "bind" {
+ for _, o := range mnt.Options {
+ if ContainsStr(invalidOptions, o) {
+ return fmt.Errorf("mount option %q is not supported: %v", o, mnt)
+ }
+ _, ok1 := optionsMap[o]
+ _, ok2 := propOptionsMap[o]
+ if !ok1 && !ok2 {
+ return fmt.Errorf("unknown mount option %q", o)
+ }
+ }
+ }
+ return nil
+}
+
+// ValidateRootfsPropagation validates that rootfs propagation options are
+// correct.
+func validateRootfsPropagation(opt string) error {
+ flags := PropOptionsToFlags([]string{opt})
+ if flags&(syscall.MS_SLAVE|syscall.MS_PRIVATE) == 0 {
+ return fmt.Errorf("root mount propagation option must specify private or slave: %q", opt)
+ }
+ return nil
+}
diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go
new file mode 100644
index 000000000..7d194335c
--- /dev/null
+++ b/runsc/specutils/namespace.go
@@ -0,0 +1,222 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package specutils
+
+import (
+ "fmt"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "runtime"
+ "syscall"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "github.com/syndtr/gocapability/capability"
+ "golang.org/x/sys/unix"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+)
+
+// nsCloneFlag returns the clone flag that can be used to set a namespace of
+// the given type.
+func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr {
+ switch nst {
+ case specs.IPCNamespace:
+ return syscall.CLONE_NEWIPC
+ case specs.MountNamespace:
+ return syscall.CLONE_NEWNS
+ case specs.NetworkNamespace:
+ return syscall.CLONE_NEWNET
+ case specs.PIDNamespace:
+ return syscall.CLONE_NEWPID
+ case specs.UTSNamespace:
+ return syscall.CLONE_NEWUTS
+ case specs.UserNamespace:
+ return syscall.CLONE_NEWUSER
+ case specs.CgroupNamespace:
+ panic("cgroup namespace has no associated clone flag")
+ default:
+ panic(fmt.Sprintf("unknown namespace %v", nst))
+ }
+}
+
+// nsPath returns the path of the namespace for the current process and the
+// given namespace.
+func nsPath(nst specs.LinuxNamespaceType) string {
+ base := "/proc/self/ns"
+ switch nst {
+ case specs.CgroupNamespace:
+ return filepath.Join(base, "cgroup")
+ case specs.IPCNamespace:
+ return filepath.Join(base, "ipc")
+ case specs.MountNamespace:
+ return filepath.Join(base, "mnt")
+ case specs.NetworkNamespace:
+ return filepath.Join(base, "net")
+ case specs.PIDNamespace:
+ return filepath.Join(base, "pid")
+ case specs.UserNamespace:
+ return filepath.Join(base, "user")
+ case specs.UTSNamespace:
+ return filepath.Join(base, "uts")
+ default:
+ panic(fmt.Sprintf("unknown namespace %v", nst))
+ }
+}
+
+// GetNS returns true and the namespace with the given type from the slice of
+// namespaces in the spec. It returns false if the slice does not contain a
+// namespace with the type.
+func GetNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) {
+ if s.Linux == nil {
+ return specs.LinuxNamespace{}, false
+ }
+ for _, ns := range s.Linux.Namespaces {
+ if ns.Type == nst {
+ return ns, true
+ }
+ }
+ return specs.LinuxNamespace{}, false
+}
+
+// FilterNS returns a slice of namespaces from the spec with types that match
+// those in the `filter` slice.
+func FilterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNamespace {
+ if s.Linux == nil {
+ return nil
+ }
+ var out []specs.LinuxNamespace
+ for _, nst := range filter {
+ if ns, ok := GetNS(nst, s); ok {
+ out = append(out, ns)
+ }
+ }
+ return out
+}
+
+// setNS sets the namespace of the given type. It must be called with
+// OSThreadLocked.
+func setNS(fd, nsType uintptr) error {
+ if _, _, err := syscall.RawSyscall(unix.SYS_SETNS, fd, nsType, 0); err != 0 {
+ return err
+ }
+ return nil
+}
+
+// ApplyNS applies the namespace on the current thread and returns a function
+// that will restore the namespace to the original value.
+//
+// Preconditions: Must be called with os thread locked.
+func ApplyNS(ns specs.LinuxNamespace) (func(), error) {
+ log.Infof("Applying namespace %v at path %q", ns.Type, ns.Path)
+ newNS, err := os.Open(ns.Path)
+ if err != nil {
+ return nil, fmt.Errorf("error opening %q: %v", ns.Path, err)
+ }
+ defer newNS.Close()
+
+ // Store current namespace to restore back.
+ curPath := nsPath(ns.Type)
+ oldNS, err := os.Open(curPath)
+ if err != nil {
+ return nil, fmt.Errorf("error opening %q: %v", curPath, err)
+ }
+
+ // Set namespace to the one requested and setup function to restore it back.
+ flag := nsCloneFlag(ns.Type)
+ if err := setNS(newNS.Fd(), flag); err != nil {
+ oldNS.Close()
+ return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err)
+ }
+ return func() {
+ log.Infof("Restoring namespace %v", ns.Type)
+ defer oldNS.Close()
+ if err := setNS(oldNS.Fd(), flag); err != nil {
+ panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err))
+ }
+ }, nil
+}
+
+// StartInNS joins or creates the given namespaces and calls cmd.Start before
+// restoring the namespaces to the original values.
+func StartInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error {
+ // We are about to setup namespaces, which requires the os thread being
+ // locked so that Go doesn't change the thread out from under us.
+ runtime.LockOSThread()
+ defer runtime.UnlockOSThread()
+
+ if cmd.SysProcAttr == nil {
+ cmd.SysProcAttr = &syscall.SysProcAttr{}
+ }
+
+ for _, ns := range nss {
+ if ns.Path == "" {
+ // No path. Just set a flag to create a new namespace.
+ cmd.SysProcAttr.Cloneflags |= nsCloneFlag(ns.Type)
+ continue
+ }
+ // Join the given namespace, and restore the current namespace
+ // before exiting.
+ restoreNS, err := ApplyNS(ns)
+ if err != nil {
+ return err
+ }
+ defer restoreNS()
+ }
+
+ return cmd.Start()
+}
+
+// SetUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd.
+func SetUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) {
+ if s.Linux == nil {
+ return
+ }
+ if cmd.SysProcAttr == nil {
+ cmd.SysProcAttr = &syscall.SysProcAttr{}
+ }
+ for _, idMap := range s.Linux.UIDMappings {
+ log.Infof("Mapping host uid %d to container uid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size)
+ cmd.SysProcAttr.UidMappings = append(cmd.SysProcAttr.UidMappings, syscall.SysProcIDMap{
+ ContainerID: int(idMap.ContainerID),
+ HostID: int(idMap.HostID),
+ Size: int(idMap.Size),
+ })
+ }
+ for _, idMap := range s.Linux.GIDMappings {
+ log.Infof("Mapping host gid %d to container gid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size)
+ cmd.SysProcAttr.GidMappings = append(cmd.SysProcAttr.GidMappings, syscall.SysProcIDMap{
+ ContainerID: int(idMap.ContainerID),
+ HostID: int(idMap.HostID),
+ Size: int(idMap.Size),
+ })
+ }
+}
+
+// HasCapabilities returns true if the user has all capabilties in 'cs'.
+func HasCapabilities(cs ...capability.Cap) bool {
+ caps, err := capability.NewPid2(os.Getpid())
+ if err != nil {
+ return false
+ }
+ if err := caps.Load(); err != nil {
+ return false
+ }
+ for _, c := range cs {
+ if !caps.Get(capability.EFFECTIVE, c) {
+ return false
+ }
+ }
+ return true
+}
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
new file mode 100644
index 000000000..2888f55db
--- /dev/null
+++ b/runsc/specutils/specutils.go
@@ -0,0 +1,494 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package specutils contains utility functions for working with OCI runtime
+// specs.
+package specutils
+
+import (
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "path"
+ "path/filepath"
+ "strings"
+ "syscall"
+ "time"
+
+ "github.com/cenkalti/backoff"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+)
+
+// ExePath must point to runsc binary, which is normally the same binary. It's
+// changed in tests that aren't linked in the same binary.
+var ExePath = "/proc/self/exe"
+
+// Version is the supported spec version.
+var Version = specs.Version
+
+// LogSpec logs the spec in a human-friendly way.
+func LogSpec(spec *specs.Spec) {
+ log.Debugf("Spec: %+v", spec)
+ log.Debugf("Spec.Hooks: %+v", spec.Hooks)
+ log.Debugf("Spec.Linux: %+v", spec.Linux)
+ if spec.Linux != nil && spec.Linux.Resources != nil {
+ res := spec.Linux.Resources
+ log.Debugf("Spec.Linux.Resources.Memory: %+v", res.Memory)
+ log.Debugf("Spec.Linux.Resources.CPU: %+v", res.CPU)
+ log.Debugf("Spec.Linux.Resources.BlockIO: %+v", res.BlockIO)
+ log.Debugf("Spec.Linux.Resources.Network: %+v", res.Network)
+ }
+ log.Debugf("Spec.Process: %+v", spec.Process)
+ log.Debugf("Spec.Root: %+v", spec.Root)
+ log.Debugf("Spec.Mounts: %+v", spec.Mounts)
+}
+
+// ValidateSpec validates that the spec is compatible with runsc.
+func ValidateSpec(spec *specs.Spec) error {
+ // Mandatory fields.
+ if spec.Process == nil {
+ return fmt.Errorf("Spec.Process must be defined: %+v", spec)
+ }
+ if len(spec.Process.Args) == 0 {
+ return fmt.Errorf("Spec.Process.Arg must be defined: %+v", spec.Process)
+ }
+ if spec.Root == nil {
+ return fmt.Errorf("Spec.Root must be defined: %+v", spec)
+ }
+ if len(spec.Root.Path) == 0 {
+ return fmt.Errorf("Spec.Root.Path must be defined: %+v", spec.Root)
+ }
+
+ // Unsupported fields.
+ if spec.Solaris != nil {
+ return fmt.Errorf("Spec.Solaris is not supported: %+v", spec)
+ }
+ if spec.Windows != nil {
+ return fmt.Errorf("Spec.Windows is not supported: %+v", spec)
+ }
+ if len(spec.Process.SelinuxLabel) != 0 {
+ return fmt.Errorf("SELinux is not supported: %s", spec.Process.SelinuxLabel)
+ }
+
+ // Docker uses AppArmor by default, so just log that it's being ignored.
+ if spec.Process.ApparmorProfile != "" {
+ log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile)
+ }
+
+ // TODO(b/72226747): Apply seccomp to application inside sandbox.
+ if spec.Linux != nil && spec.Linux.Seccomp != nil {
+ log.Warningf("Seccomp spec is being ignored")
+ }
+
+ if spec.Linux != nil && spec.Linux.RootfsPropagation != "" {
+ if err := validateRootfsPropagation(spec.Linux.RootfsPropagation); err != nil {
+ return err
+ }
+ }
+ for _, m := range spec.Mounts {
+ if err := validateMount(&m); err != nil {
+ return err
+ }
+ }
+
+ // Two annotations are use by containerd to support multi-container pods.
+ // "io.kubernetes.cri.container-type"
+ // "io.kubernetes.cri.sandbox-id"
+ containerType, hasContainerType := spec.Annotations[ContainerdContainerTypeAnnotation]
+ _, hasSandboxID := spec.Annotations[ContainerdSandboxIDAnnotation]
+ switch {
+ // Non-containerd use won't set a container type.
+ case !hasContainerType:
+ case containerType == ContainerdContainerTypeSandbox:
+ // When starting a container in an existing sandbox, the sandbox ID
+ // must be set.
+ case containerType == ContainerdContainerTypeContainer:
+ if !hasSandboxID {
+ return fmt.Errorf("spec has container-type of %s, but no sandbox ID set", containerType)
+ }
+ default:
+ return fmt.Errorf("unknown container-type: %s", containerType)
+ }
+
+ return nil
+}
+
+// absPath turns the given path into an absolute path (if it is not already
+// absolute) by prepending the base path.
+func absPath(base, rel string) string {
+ if filepath.IsAbs(rel) {
+ return rel
+ }
+ return filepath.Join(base, rel)
+}
+
+// OpenSpec opens an OCI runtime spec from the given bundle directory.
+func OpenSpec(bundleDir string) (*os.File, error) {
+ // The spec file must be named "config.json" inside the bundle directory.
+ return os.Open(filepath.Join(bundleDir, "config.json"))
+}
+
+// ReadSpec reads an OCI runtime spec from the given bundle directory.
+// ReadSpec also normalizes all potential relative paths into absolute
+// path, e.g. spec.Root.Path, mount.Source.
+func ReadSpec(bundleDir string) (*specs.Spec, error) {
+ specFile, err := OpenSpec(bundleDir)
+ if err != nil {
+ return nil, fmt.Errorf("error opening spec file %q: %v", filepath.Join(bundleDir, "config.json"), err)
+ }
+ defer specFile.Close()
+ return ReadSpecFromFile(bundleDir, specFile)
+}
+
+// ReadSpecFromFile reads an OCI runtime spec from the given File, and
+// normalizes all relative paths into absolute by prepending the bundle dir.
+func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error) {
+ if _, err := specFile.Seek(0, os.SEEK_SET); err != nil {
+ return nil, fmt.Errorf("error seeking to beginning of file %q: %v", specFile.Name(), err)
+ }
+ specBytes, err := ioutil.ReadAll(specFile)
+ if err != nil {
+ return nil, fmt.Errorf("error reading spec from file %q: %v", specFile.Name(), err)
+ }
+ var spec specs.Spec
+ if err := json.Unmarshal(specBytes, &spec); err != nil {
+ return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile.Name(), err, string(specBytes))
+ }
+ if err := ValidateSpec(&spec); err != nil {
+ return nil, err
+ }
+ // Turn any relative paths in the spec to absolute by prepending the bundleDir.
+ spec.Root.Path = absPath(bundleDir, spec.Root.Path)
+ for i := range spec.Mounts {
+ m := &spec.Mounts[i]
+ if m.Source != "" {
+ m.Source = absPath(bundleDir, m.Source)
+ }
+ }
+ return &spec, nil
+}
+
+// ReadMounts reads mount list from a file.
+func ReadMounts(f *os.File) ([]specs.Mount, error) {
+ bytes, err := ioutil.ReadAll(f)
+ if err != nil {
+ return nil, fmt.Errorf("error reading mounts: %v", err)
+ }
+ var mounts []specs.Mount
+ if err := json.Unmarshal(bytes, &mounts); err != nil {
+ return nil, fmt.Errorf("error unmarshaling mounts: %v\n %s", err, string(bytes))
+ }
+ return mounts, nil
+}
+
+// Capabilities takes in spec and returns a TaskCapabilities corresponding to
+// the spec.
+func Capabilities(enableRaw bool, specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) {
+ // Strip CAP_NET_RAW from all capability sets if necessary.
+ skipSet := map[linux.Capability]struct{}{}
+ if !enableRaw {
+ skipSet[linux.CAP_NET_RAW] = struct{}{}
+ }
+
+ var caps auth.TaskCapabilities
+ if specCaps != nil {
+ var err error
+ if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding, skipSet); err != nil {
+ return nil, err
+ }
+ if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective, skipSet); err != nil {
+ return nil, err
+ }
+ if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable, skipSet); err != nil {
+ return nil, err
+ }
+ if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted, skipSet); err != nil {
+ return nil, err
+ }
+ // TODO(nlacasse): Support ambient capabilities.
+ }
+ return &caps, nil
+}
+
+// AllCapabilities returns a LinuxCapabilities struct with all capabilities.
+func AllCapabilities() *specs.LinuxCapabilities {
+ var names []string
+ for n := range capFromName {
+ names = append(names, n)
+ }
+ return &specs.LinuxCapabilities{
+ Bounding: names,
+ Effective: names,
+ Inheritable: names,
+ Permitted: names,
+ Ambient: names,
+ }
+}
+
+var capFromName = map[string]linux.Capability{
+ "CAP_CHOWN": linux.CAP_CHOWN,
+ "CAP_DAC_OVERRIDE": linux.CAP_DAC_OVERRIDE,
+ "CAP_DAC_READ_SEARCH": linux.CAP_DAC_READ_SEARCH,
+ "CAP_FOWNER": linux.CAP_FOWNER,
+ "CAP_FSETID": linux.CAP_FSETID,
+ "CAP_KILL": linux.CAP_KILL,
+ "CAP_SETGID": linux.CAP_SETGID,
+ "CAP_SETUID": linux.CAP_SETUID,
+ "CAP_SETPCAP": linux.CAP_SETPCAP,
+ "CAP_LINUX_IMMUTABLE": linux.CAP_LINUX_IMMUTABLE,
+ "CAP_NET_BIND_SERVICE": linux.CAP_NET_BIND_SERVICE,
+ "CAP_NET_BROADCAST": linux.CAP_NET_BROADCAST,
+ "CAP_NET_ADMIN": linux.CAP_NET_ADMIN,
+ "CAP_NET_RAW": linux.CAP_NET_RAW,
+ "CAP_IPC_LOCK": linux.CAP_IPC_LOCK,
+ "CAP_IPC_OWNER": linux.CAP_IPC_OWNER,
+ "CAP_SYS_MODULE": linux.CAP_SYS_MODULE,
+ "CAP_SYS_RAWIO": linux.CAP_SYS_RAWIO,
+ "CAP_SYS_CHROOT": linux.CAP_SYS_CHROOT,
+ "CAP_SYS_PTRACE": linux.CAP_SYS_PTRACE,
+ "CAP_SYS_PACCT": linux.CAP_SYS_PACCT,
+ "CAP_SYS_ADMIN": linux.CAP_SYS_ADMIN,
+ "CAP_SYS_BOOT": linux.CAP_SYS_BOOT,
+ "CAP_SYS_NICE": linux.CAP_SYS_NICE,
+ "CAP_SYS_RESOURCE": linux.CAP_SYS_RESOURCE,
+ "CAP_SYS_TIME": linux.CAP_SYS_TIME,
+ "CAP_SYS_TTY_CONFIG": linux.CAP_SYS_TTY_CONFIG,
+ "CAP_MKNOD": linux.CAP_MKNOD,
+ "CAP_LEASE": linux.CAP_LEASE,
+ "CAP_AUDIT_WRITE": linux.CAP_AUDIT_WRITE,
+ "CAP_AUDIT_CONTROL": linux.CAP_AUDIT_CONTROL,
+ "CAP_SETFCAP": linux.CAP_SETFCAP,
+ "CAP_MAC_OVERRIDE": linux.CAP_MAC_OVERRIDE,
+ "CAP_MAC_ADMIN": linux.CAP_MAC_ADMIN,
+ "CAP_SYSLOG": linux.CAP_SYSLOG,
+ "CAP_WAKE_ALARM": linux.CAP_WAKE_ALARM,
+ "CAP_BLOCK_SUSPEND": linux.CAP_BLOCK_SUSPEND,
+ "CAP_AUDIT_READ": linux.CAP_AUDIT_READ,
+}
+
+func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth.CapabilitySet, error) {
+ var caps []linux.Capability
+ for _, n := range names {
+ c, ok := capFromName[n]
+ if !ok {
+ return 0, fmt.Errorf("unknown capability %q", n)
+ }
+ // Should we skip this capabilty?
+ if _, ok := skipSet[c]; ok {
+ continue
+ }
+ caps = append(caps, c)
+ }
+ return auth.CapabilitySetOfMany(caps), nil
+}
+
+// Is9PMount returns true if the given mount can be mounted as an external gofer.
+func Is9PMount(m specs.Mount) bool {
+ return m.Type == "bind" && m.Source != "" && IsSupportedDevMount(m)
+}
+
+// IsSupportedDevMount returns true if the mount is a supported /dev mount.
+// Only mount that does not conflict with runsc default /dev mount is
+// supported.
+func IsSupportedDevMount(m specs.Mount) bool {
+ // These are devices exist inside sentry. See pkg/sentry/fs/dev/dev.go
+ var existingDevices = []string{
+ "/dev/fd", "/dev/stdin", "/dev/stdout", "/dev/stderr",
+ "/dev/null", "/dev/zero", "/dev/full", "/dev/random",
+ "/dev/urandom", "/dev/shm", "/dev/pts", "/dev/ptmx",
+ }
+ dst := filepath.Clean(m.Destination)
+ if dst == "/dev" {
+ // OCI spec uses many different mounts for the things inside of '/dev'. We
+ // have a single mount at '/dev' that is always mounted, regardless of
+ // whether it was asked for, as the spec says we SHOULD.
+ return false
+ }
+ for _, dev := range existingDevices {
+ if dst == dev || strings.HasPrefix(dst, dev+"/") {
+ return false
+ }
+ }
+ return true
+}
+
+const (
+ // ContainerdContainerTypeAnnotation is the OCI annotation set by
+ // containerd to indicate whether the container to create should have
+ // its own sandbox or a container within an existing sandbox.
+ ContainerdContainerTypeAnnotation = "io.kubernetes.cri.container-type"
+ // ContainerdContainerTypeContainer is the container type value
+ // indicating the container should be created in an existing sandbox.
+ ContainerdContainerTypeContainer = "container"
+ // ContainerdContainerTypeSandbox is the container type value
+ // indicating the container should be created in a new sandbox.
+ ContainerdContainerTypeSandbox = "sandbox"
+
+ // ContainerdSandboxIDAnnotation is the OCI annotation set to indicate
+ // which sandbox the container should be created in when the container
+ // is not the first container in the sandbox.
+ ContainerdSandboxIDAnnotation = "io.kubernetes.cri.sandbox-id"
+)
+
+// ShouldCreateSandbox returns true if the spec indicates that a new sandbox
+// should be created for the container. If false, the container should be
+// started in an existing sandbox.
+func ShouldCreateSandbox(spec *specs.Spec) bool {
+ t, ok := spec.Annotations[ContainerdContainerTypeAnnotation]
+ return !ok || t == ContainerdContainerTypeSandbox
+}
+
+// SandboxID returns the ID of the sandbox to join and whether an ID was found
+// in the spec.
+func SandboxID(spec *specs.Spec) (string, bool) {
+ id, ok := spec.Annotations[ContainerdSandboxIDAnnotation]
+ return id, ok
+}
+
+// WaitForReady waits for a process to become ready. The process is ready when
+// the 'ready' function returns true. It continues to wait if 'ready' returns
+// false. It returns error on timeout, if the process stops or if 'ready' fails.
+func WaitForReady(pid int, timeout time.Duration, ready func() (bool, error)) error {
+ b := backoff.NewExponentialBackOff()
+ b.InitialInterval = 1 * time.Millisecond
+ b.MaxInterval = 1 * time.Second
+ b.MaxElapsedTime = timeout
+
+ op := func() error {
+ if ok, err := ready(); err != nil {
+ return backoff.Permanent(err)
+ } else if ok {
+ return nil
+ }
+
+ // Check if the process is still running.
+ // If the process is alive, child is 0 because of the NOHANG option.
+ // If the process has terminated, child equals the process id.
+ var ws syscall.WaitStatus
+ var ru syscall.Rusage
+ child, err := syscall.Wait4(pid, &ws, syscall.WNOHANG, &ru)
+ if err != nil {
+ return backoff.Permanent(fmt.Errorf("error waiting for process: %v", err))
+ } else if child == pid {
+ return backoff.Permanent(fmt.Errorf("process %d has terminated", pid))
+ }
+ return fmt.Errorf("process %d not running yet", pid)
+ }
+ return backoff.Retry(op, b)
+}
+
+// DebugLogFile opens a log file using 'logPattern' as location. If 'logPattern'
+// ends with '/', it's used as a directory with default file name.
+// 'logPattern' can contain variables that are substitued:
+// - %TIMESTAMP%: is replaced with a timestamp using the following format:
+// <yyyymmdd-hhmmss.uuuuuu>
+// - %COMMAND%: is replaced with 'command'
+func DebugLogFile(logPattern, command string) (*os.File, error) {
+ if strings.HasSuffix(logPattern, "/") {
+ // Default format: <debug-log>/runsc.log.<yyyymmdd-hhmmss.uuuuuu>.<command>
+ logPattern += "runsc.log.%TIMESTAMP%.%COMMAND%"
+ }
+ logPattern = strings.Replace(logPattern, "%TIMESTAMP%", time.Now().Format("20060102-150405.000000"), -1)
+ logPattern = strings.Replace(logPattern, "%COMMAND%", command, -1)
+
+ dir := filepath.Dir(logPattern)
+ if err := os.MkdirAll(dir, 0775); err != nil {
+ return nil, fmt.Errorf("error creating dir %q: %v", dir, err)
+ }
+ return os.OpenFile(logPattern, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664)
+}
+
+// Mount creates the mount point and calls Mount with the given flags.
+func Mount(src, dst, typ string, flags uint32) error {
+ // Create the mount point inside. The type must be the same as the
+ // source (file or directory).
+ var isDir bool
+ if typ == "proc" {
+ // Special case, as there is no source directory for proc mounts.
+ isDir = true
+ } else if fi, err := os.Stat(src); err != nil {
+ return fmt.Errorf("Stat(%q) failed: %v", src, err)
+ } else {
+ isDir = fi.IsDir()
+ }
+
+ if isDir {
+ // Create the destination directory.
+ if err := os.MkdirAll(dst, 0777); err != nil {
+ return fmt.Errorf("Mkdir(%q) failed: %v", dst, err)
+ }
+ } else {
+ // Create the parent destination directory.
+ parent := path.Dir(dst)
+ if err := os.MkdirAll(parent, 0777); err != nil {
+ return fmt.Errorf("Mkdir(%q) failed: %v", parent, err)
+ }
+ // Create the destination file if it does not exist.
+ f, err := os.OpenFile(dst, syscall.O_CREAT, 0777)
+ if err != nil {
+ return fmt.Errorf("Open(%q) failed: %v", dst, err)
+ }
+ f.Close()
+ }
+
+ // Do the mount.
+ if err := syscall.Mount(src, dst, typ, uintptr(flags), ""); err != nil {
+ return fmt.Errorf("Mount(%q, %q, %d) failed: %v", src, dst, flags, err)
+ }
+ return nil
+}
+
+// ContainsStr returns true if 'str' is inside 'strs'.
+func ContainsStr(strs []string, str string) bool {
+ for _, s := range strs {
+ if s == str {
+ return true
+ }
+ }
+ return false
+}
+
+// Cleanup allows defers to be aborted when cleanup needs to happen
+// conditionally. Usage:
+// c := MakeCleanup(func() { f.Close() })
+// defer c.Clean() // any failure before release is called will close the file.
+// ...
+// c.Release() // on success, aborts closing the file and return it.
+// return f
+type Cleanup struct {
+ clean func()
+}
+
+// MakeCleanup creates a new Cleanup object.
+func MakeCleanup(f func()) Cleanup {
+ return Cleanup{clean: f}
+}
+
+// Clean calls the cleanup function.
+func (c *Cleanup) Clean() {
+ if c.clean != nil {
+ c.clean()
+ c.clean = nil
+ }
+}
+
+// Release releases the cleanup from its duties, i.e. cleanup function is not
+// called after this point.
+func (c *Cleanup) Release() {
+ c.clean = nil
+}