summaryrefslogtreecommitdiffhomepage
path: root/runsc/specutils/namespace.go
diff options
context:
space:
mode:
Diffstat (limited to 'runsc/specutils/namespace.go')
-rw-r--r--runsc/specutils/namespace.go277
1 files changed, 277 insertions, 0 deletions
diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go
new file mode 100644
index 000000000..60bb7b7ee
--- /dev/null
+++ b/runsc/specutils/namespace.go
@@ -0,0 +1,277 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package specutils
+
+import (
+ "fmt"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "runtime"
+ "syscall"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "github.com/syndtr/gocapability/capability"
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/log"
+)
+
+// nsCloneFlag returns the clone flag that can be used to set a namespace of
+// the given type.
+func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr {
+ switch nst {
+ case specs.IPCNamespace:
+ return unix.CLONE_NEWIPC
+ case specs.MountNamespace:
+ return unix.CLONE_NEWNS
+ case specs.NetworkNamespace:
+ return unix.CLONE_NEWNET
+ case specs.PIDNamespace:
+ return unix.CLONE_NEWPID
+ case specs.UTSNamespace:
+ return unix.CLONE_NEWUTS
+ case specs.UserNamespace:
+ return unix.CLONE_NEWUSER
+ case specs.CgroupNamespace:
+ return unix.CLONE_NEWCGROUP
+ default:
+ panic(fmt.Sprintf("unknown namespace %v", nst))
+ }
+}
+
+// nsPath returns the path of the namespace for the current process and the
+// given namespace.
+func nsPath(nst specs.LinuxNamespaceType) string {
+ base := "/proc/self/ns"
+ switch nst {
+ case specs.CgroupNamespace:
+ return filepath.Join(base, "cgroup")
+ case specs.IPCNamespace:
+ return filepath.Join(base, "ipc")
+ case specs.MountNamespace:
+ return filepath.Join(base, "mnt")
+ case specs.NetworkNamespace:
+ return filepath.Join(base, "net")
+ case specs.PIDNamespace:
+ return filepath.Join(base, "pid")
+ case specs.UserNamespace:
+ return filepath.Join(base, "user")
+ case specs.UTSNamespace:
+ return filepath.Join(base, "uts")
+ default:
+ panic(fmt.Sprintf("unknown namespace %v", nst))
+ }
+}
+
+// GetNS returns true and the namespace with the given type from the slice of
+// namespaces in the spec. It returns false if the slice does not contain a
+// namespace with the type.
+func GetNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) {
+ if s.Linux == nil {
+ return specs.LinuxNamespace{}, false
+ }
+ for _, ns := range s.Linux.Namespaces {
+ if ns.Type == nst {
+ return ns, true
+ }
+ }
+ return specs.LinuxNamespace{}, false
+}
+
+// FilterNS returns a slice of namespaces from the spec with types that match
+// those in the `filter` slice.
+func FilterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNamespace {
+ if s.Linux == nil {
+ return nil
+ }
+ var out []specs.LinuxNamespace
+ for _, nst := range filter {
+ if ns, ok := GetNS(nst, s); ok {
+ out = append(out, ns)
+ }
+ }
+ return out
+}
+
+// setNS sets the namespace of the given type. It must be called with
+// OSThreadLocked.
+func setNS(fd, nsType uintptr) error {
+ if _, _, err := syscall.RawSyscall(unix.SYS_SETNS, fd, nsType, 0); err != 0 {
+ return err
+ }
+ return nil
+}
+
+// ApplyNS applies the namespace on the current thread and returns a function
+// that will restore the namespace to the original value.
+//
+// Preconditions: Must be called with os thread locked.
+func ApplyNS(ns specs.LinuxNamespace) (func(), error) {
+ log.Infof("Applying namespace %v at path %q", ns.Type, ns.Path)
+ newNS, err := os.Open(ns.Path)
+ if err != nil {
+ return nil, fmt.Errorf("error opening %q: %v", ns.Path, err)
+ }
+ defer newNS.Close()
+
+ // Store current namespace to restore back.
+ curPath := nsPath(ns.Type)
+ oldNS, err := os.Open(curPath)
+ if err != nil {
+ return nil, fmt.Errorf("error opening %q: %v", curPath, err)
+ }
+
+ // Set namespace to the one requested and setup function to restore it back.
+ flag := nsCloneFlag(ns.Type)
+ if err := setNS(newNS.Fd(), flag); err != nil {
+ oldNS.Close()
+ return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err)
+ }
+ return func() {
+ log.Infof("Restoring namespace %v", ns.Type)
+ defer oldNS.Close()
+ if err := setNS(oldNS.Fd(), flag); err != nil {
+ panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err))
+ }
+ }, nil
+}
+
+// StartInNS joins or creates the given namespaces and calls cmd.Start before
+// restoring the namespaces to the original values.
+func StartInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error {
+ // We are about to setup namespaces, which requires the os thread being
+ // locked so that Go doesn't change the thread out from under us.
+ runtime.LockOSThread()
+ defer runtime.UnlockOSThread()
+
+ if cmd.SysProcAttr == nil {
+ cmd.SysProcAttr = &syscall.SysProcAttr{}
+ }
+
+ for _, ns := range nss {
+ if ns.Path == "" {
+ // No path. Just set a flag to create a new namespace.
+ cmd.SysProcAttr.Cloneflags |= nsCloneFlag(ns.Type)
+ continue
+ }
+ // Join the given namespace, and restore the current namespace
+ // before exiting.
+ restoreNS, err := ApplyNS(ns)
+ if err != nil {
+ return err
+ }
+ defer restoreNS()
+ }
+
+ return cmd.Start()
+}
+
+// SetUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd.
+func SetUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) {
+ if s.Linux == nil {
+ return
+ }
+ if cmd.SysProcAttr == nil {
+ cmd.SysProcAttr = &syscall.SysProcAttr{}
+ }
+ for _, idMap := range s.Linux.UIDMappings {
+ log.Infof("Mapping host uid %d to container uid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size)
+ cmd.SysProcAttr.UidMappings = append(cmd.SysProcAttr.UidMappings, syscall.SysProcIDMap{
+ ContainerID: int(idMap.ContainerID),
+ HostID: int(idMap.HostID),
+ Size: int(idMap.Size),
+ })
+ }
+ for _, idMap := range s.Linux.GIDMappings {
+ log.Infof("Mapping host gid %d to container gid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size)
+ cmd.SysProcAttr.GidMappings = append(cmd.SysProcAttr.GidMappings, syscall.SysProcIDMap{
+ ContainerID: int(idMap.ContainerID),
+ HostID: int(idMap.HostID),
+ Size: int(idMap.Size),
+ })
+ }
+}
+
+// HasCapabilities returns true if the user has all capabilities in 'cs'.
+func HasCapabilities(cs ...capability.Cap) bool {
+ caps, err := capability.NewPid2(os.Getpid())
+ if err != nil {
+ return false
+ }
+ if err := caps.Load(); err != nil {
+ return false
+ }
+ for _, c := range cs {
+ if !caps.Get(capability.EFFECTIVE, c) {
+ return false
+ }
+ }
+ return true
+}
+
+// MaybeRunAsRoot ensures the process runs with capabilities needed to create a
+// sandbox, e.g. CAP_SYS_ADMIN, CAP_SYS_CHROOT, etc. If capabilities are needed,
+// it will create a new user namespace and re-execute the process as root
+// inside the namespace with the same arguments and environment.
+//
+// This function returns immediately when no new capability is needed. If
+// another process is executed, it returns straight from here with the same exit
+// code as the child.
+func MaybeRunAsRoot() error {
+ if HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT, capability.CAP_SETUID, capability.CAP_SETGID) {
+ return nil
+ }
+
+ // Current process doesn't have required capabilities, create user namespace
+ // and run as root inside the namespace to acquire capabilities.
+ log.Infof("*** Re-running as root in new user namespace ***")
+
+ cmd := exec.Command("/proc/self/exe", os.Args[1:]...)
+
+ cmd.SysProcAttr = &syscall.SysProcAttr{
+ Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
+ // Set current user/group as root inside the namespace. Since we may not
+ // have CAP_SETUID/CAP_SETGID, just map root to the current user/group.
+ UidMappings: []syscall.SysProcIDMap{
+ {ContainerID: 0, HostID: os.Getuid(), Size: 1},
+ },
+ GidMappings: []syscall.SysProcIDMap{
+ {ContainerID: 0, HostID: os.Getgid(), Size: 1},
+ },
+ Credential: &syscall.Credential{Uid: 0, Gid: 0},
+ GidMappingsEnableSetgroups: false,
+
+ // Make sure child is killed when the parent terminates.
+ Pdeathsig: syscall.SIGKILL,
+ }
+
+ cmd.Env = os.Environ()
+ cmd.Stdin = os.Stdin
+ cmd.Stdout = os.Stdout
+ cmd.Stderr = os.Stderr
+ if err := cmd.Run(); err != nil {
+ if exit, ok := err.(*exec.ExitError); ok {
+ if ws, ok := exit.Sys().(syscall.WaitStatus); ok {
+ os.Exit(ws.ExitStatus())
+ }
+ log.Warningf("No wait status provided, exiting with -1: %v", err)
+ os.Exit(-1)
+ }
+ return fmt.Errorf("re-executing self: %v", err)
+ }
+ // Child completed with success.
+ os.Exit(0)
+ panic("unreachable")
+}