3 files changed, 82 insertions, 15 deletions
diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD
index 15476de6f..0456e4c4f 100644
--- a/runsc/specutils/BUILD
+++ b/runsc/specutils/BUILD
@@ -10,10 +10,7 @@ go_library(
         "specutils.go",
     ],
     importpath = "gvisor.googlesource.com/gvisor/runsc/specutils",
-    visibility = [
-        "//runsc:__subpackages__",
-        "//test:__subpackages__",
-    ],
+    visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/log",
diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go
index 1f3afb4e4..6e6902e9f 100644
--- a/runsc/specutils/fs.go
+++ b/runsc/specutils/fs.go
@@ -16,6 +16,7 @@ package specutils
 
 import (
 	"fmt"
+	"math/bits"
 	"path"
 	"syscall"
 
@@ -105,22 +106,30 @@ func optionsToFlags(opts []string, source map[string]mapping) uint32 {
 	return rv
 }
 
-// ValidateMount validates that spec mounts are correct.
+// validateMount validates that spec mounts are correct.
 func validateMount(mnt *specs.Mount) error {
 	if !path.IsAbs(mnt.Destination) {
 		return fmt.Errorf("Mount.Destination must be an absolute path: %v", mnt)
 	}
-
 	if mnt.Type == "bind" {
-		for _, o := range mnt.Options {
-			if ContainsStr(invalidOptions, o) {
-				return fmt.Errorf("mount option %q is not supported: %v", o, mnt)
-			}
-			_, ok1 := optionsMap[o]
-			_, ok2 := propOptionsMap[o]
-			if !ok1 && !ok2 {
-				return fmt.Errorf("unknown mount option %q", o)
-			}
+		return ValidateMountOptions(mnt.Options)
+	}
+	return nil
+}
+
+// ValidateMountOptions validates that mount options are correct.
+func ValidateMountOptions(opts []string) error {
+	for _, o := range opts {
+		if ContainsStr(invalidOptions, o) {
+			return fmt.Errorf("mount option %q is not supported", o)
+		}
+		_, ok1 := optionsMap[o]
+		_, ok2 := propOptionsMap[o]
+		if !ok1 && !ok2 {
+			return fmt.Errorf("unknown mount option %q", o)
+		}
+		if err := validatePropagation(o); err != nil {
+			return err
 		}
 	}
 	return nil
@@ -133,5 +142,14 @@ func validateRootfsPropagation(opt string) error {
 	if flags&(syscall.MS_SLAVE|syscall.MS_PRIVATE) == 0 {
 		return fmt.Errorf("root mount propagation option must specify private or slave: %q", opt)
 	}
+	return validatePropagation(opt)
+}
+
+func validatePropagation(opt string) error {
+	flags := PropOptionsToFlags([]string{opt})
+	exclusive := flags & (syscall.MS_SLAVE | syscall.MS_PRIVATE | syscall.MS_SHARED | syscall.MS_UNBINDABLE)
+	if bits.OnesCount32(exclusive) > 1 {
+		return fmt.Errorf("mount propagation options are mutually exclusive: %q", opt)
+	}
 	return nil
 }
diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go
index 7d194335c..06c13d1ab 100644
--- a/runsc/specutils/namespace.go
+++ b/runsc/specutils/namespace.go
@@ -220,3 +220,55 @@ func HasCapabilities(cs ...capability.Cap) bool {
 	}
 	return true
 }
+
+// MaybeRunAsRoot ensures the process runs with capabilities needed to create a
+// sandbox, e.g. CAP_SYS_ADMIN, CAP_SYS_CHROOT, etc. If capabilities are needed,
+// it will create a new user namespace and re-execute the process as root
+// inside the namespace with the same arguments and environment.
+//
+// This function returns immediately when no new capability is needed. If
+// another process is executed, it returns straight from here with the same exit
+// code as the child.
+func MaybeRunAsRoot() error {
+	if HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT, capability.CAP_SETUID, capability.CAP_SETGID) {
+		return nil
+	}
+
+	// Current process doesn't have required capabilities, create user namespace
+	// and run as root inside the namespace to acquire capabilities.
+	log.Infof("*** Re-running as root in new user namespace ***")
+
+	cmd := exec.Command("/proc/self/exe", os.Args[1:]...)
+
+	cmd.SysProcAttr = &syscall.SysProcAttr{
+		Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
+		// Set current user/group as root inside the namespace. Since we may not
+		// have CAP_SETUID/CAP_SETGID, just map root to the current user/group.
+		UidMappings: []syscall.SysProcIDMap{
+			{ContainerID: 0, HostID: os.Getuid(), Size: 1},
+		},
+		GidMappings: []syscall.SysProcIDMap{
+			{ContainerID: 0, HostID: os.Getgid(), Size: 1},
+		},
+		Credential:                 &syscall.Credential{Uid: 0, Gid: 0},
+		GidMappingsEnableSetgroups: false,
+	}
+
+	cmd.Env = os.Environ()
+	cmd.Stdin = os.Stdin
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	if err := cmd.Run(); err != nil {
+		if exit, ok := err.(*exec.ExitError); ok {
+			if ws, ok := exit.Sys().(syscall.WaitStatus); ok {
+				os.Exit(ws.ExitStatus())
+			}
+			log.Warningf("No wait status provided, exiting with -1: %v", err)
+			os.Exit(-1)
+		}
+		return fmt.Errorf("re-executing self: %v", err)
+	}
+	// Child completed with success.
+	os.Exit(0)
+	panic("unreachable")
+}