9 files changed, 221 insertions, 52 deletions
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 87a47dd0b..28a1600cd 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -214,10 +214,11 @@ type Config struct {
 	// SIGUSR2(12) to troubleshoot hangs. -1 disables it.
 	PanicSignal int
 
-	// TestOnlyAllowRunAsCurrentUser should only be used in tests. It
-	// allows runsc to start the sandbox process as the current user if we
-	// do not have capability to set uid/gid to another user.
-	TestOnlyAllowRunAsCurrentUser bool
+	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
+	// tests. It allows runsc to start the sandbox process as the current
+	// user, and without chrooting the sandbox process. This can be
+	// necessary in test environments that have limited capabilities.
+	TestOnlyAllowRunAsCurrentUserWithoutChroot bool
 }
 
 // ToFlags returns a slice of flags that correspond to the given Config.
diff --git a/runsc/container/fs.go b/runsc/container/fs.go
index fb352fc7c..a3c5772ba 100644
--- a/runsc/container/fs.go
+++ b/runsc/container/fs.go
@@ -77,11 +77,6 @@ func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) error {
 		if m.Type != "bind" || !specutils.IsSupportedDevMount(m) {
 			continue
 		}
-		src := m.Source
-		srcfi, err := os.Stat(src)
-		if err != nil {
-			return fmt.Errorf("failed to stat() mount source: %v", err)
-		}
 
 		// It's possible that 'm.Destination' follows symlinks inside the
 		// container.
@@ -90,30 +85,13 @@ func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) error {
 			return fmt.Errorf("failed to resolve symlinks: %v", err)
 		}
 
-		// Create mount point if it doesn't exits
-		if _, err := os.Stat(dst); os.IsNotExist(err) {
-			if srcfi.IsDir() {
-				if err := os.MkdirAll(dst, 0755); err != nil {
-					return fmt.Errorf("failed to make mount directory %q: %v", dst, err)
-				}
-			} else {
-				if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
-					return fmt.Errorf("failed to make mount directory for file %q: %v", filepath.Dir(dst), err)
-				}
-				f, err := os.OpenFile(dst, os.O_CREATE, 0755)
-				if err != nil {
-					return fmt.Errorf("failed to open mount file %q: %v", dst, err)
-				}
-				f.Close()
-			}
-		}
-
 		flags := optionsToFlags(m.Options)
 		flags |= syscall.MS_BIND
-		log.Infof("Mounting src: %q, dst: %q, flags: %#x", src, dst, flags)
-		if err := syscall.Mount(src, dst, m.Type, uintptr(flags), ""); err != nil {
-			return fmt.Errorf("failed to mount src: %q, dst: %q, flags: %#x, err: %v", src, dst, flags, err)
+		log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags)
+		if err := specutils.Mount(m.Source, dst, m.Type, flags); err != nil {
+			return fmt.Errorf("failed to mount %v: %v", m, err)
 		}
+
 		// Make the mount a slave, so that for recursive bind mount, umount won't
 		// propagate to the source.
 		flags = syscall.MS_SLAVE | syscall.MS_REC
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
index 9317b1c14..8ebd14c4e 100644
--- a/runsc/sandbox/BUILD
+++ b/runsc/sandbox/BUILD
@@ -5,6 +5,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
 go_library(
     name = "sandbox",
     srcs = [
+        "chroot.go",
         "network.go",
         "sandbox.go",
     ],
diff --git a/runsc/sandbox/chroot.go b/runsc/sandbox/chroot.go
new file mode 100644
index 000000000..a77a186c2
--- /dev/null
+++ b/runsc/sandbox/chroot.go
@@ -0,0 +1,120 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sandbox
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/runsc/boot"
+	"gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+// chrootBinPath is the location inside the chroot where the runsc binary will
+// be mounted.
+const chrootBinPath = "/runsc"
+
+// mountInChroot mounts src at dst relative to the chroot directory, delegating
+// to specutils.Mount for both mount point creation and the mount itself.
+func mountInChroot(chroot, src, dst, typ string, flags uint32) error {
+	// dst is chroot-relative; resolve it to a path outside the chroot.
+	target := filepath.Join(chroot, dst)
+	log.Infof("Mounting %q at %q", src, target)
+	return specutils.Mount(src, target, typ, flags)
+}
+
+// setUpChroot builds a fresh chroot in a temporary directory and returns its
+// path. The chroot contains proc at /proc, the runsc binary bind-mounted at
+// chrootBinPath, and /dev/kvm when the platform is boot.PlatformKVM.
+func setUpChroot(platform boot.PlatformType) (string, error) {
+	// Create the chroot directory, then chmod it to 0777 so that every user
+	// can access it.
+	root, err := ioutil.TempDir("", "runsc-sandbox-chroot-")
+	if err != nil {
+		return "", fmt.Errorf("TempDir() failed: %v", err)
+	}
+	if err := os.Chmod(root, 0777); err != nil {
+		return "", fmt.Errorf("Chmod(%q) failed: %v", root, err)
+	}
+	log.Infof("Setting up sandbox chroot in %q", root)
+
+	// Mount proc at /proc inside the chroot.
+	if err := mountInChroot(root, "proc", "/proc", "proc", 0); err != nil {
+		return "", fmt.Errorf("error mounting proc in chroot: %v", err)
+	}
+
+	// Bind-mount the runsc binary, requesting a read-only mount.
+	runscPath, err := specutils.BinPath()
+	if err != nil {
+		return "", err
+	}
+	const binFlags = syscall.MS_BIND | syscall.MS_RDONLY
+	if err := mountInChroot(root, runscPath, chrootBinPath, "bind", binFlags); err != nil {
+		return "", fmt.Errorf("error mounting runsc in chroot: %v", err)
+	}
+
+	// Only the KVM platform needs a device file inside the chroot; it is
+	// mounted at the same path it has outside the chroot.
+	if platform == boot.PlatformKVM {
+		const kvmDev = "/dev/kvm"
+		if err := mountInChroot(root, kvmDev, kvmDev, "bind", syscall.MS_BIND); err != nil {
+			return "", fmt.Errorf("error mounting platform device in chroot: %v", err)
+		}
+	}
+
+	return root, nil
+}
+
+// tearDownChroot reverses setUpChroot. It unmounts /proc, the runsc binary
+// and any platform device file present in the chroot, then removes the chroot
+// directory. The first failure aborts the teardown and is returned.
+func tearDownChroot(chroot string) error {
+	// proc and the runsc binary are mounted unconditionally by setUpChroot,
+	// so failing to unmount either one is an error.
+	for _, rel := range []string{"proc", chrootBinPath} {
+		target := filepath.Join(chroot, rel)
+		if err := syscall.Unmount(target, 0); err != nil {
+			return fmt.Errorf("error unmounting %q: %v", target, err)
+		}
+	}
+
+	// Platform device files are only mounted for some platforms, so a
+	// missing file is skipped rather than treated as an error.
+	for _, rel := range []string{"dev/kvm"} {
+		target := filepath.Join(chroot, rel)
+		_, err := os.Stat(target)
+		switch {
+		case os.IsNotExist(err):
+			continue
+		case err != nil:
+			return fmt.Errorf("Stat(%q) failed: %v", target, err)
+		}
+
+		if err := syscall.Unmount(target, 0); err != nil {
+			return fmt.Errorf("error unmounting %q: %v", target, err)
+		}
+	}
+
+	// Remove the chroot directory now that nothing is mounted inside it.
+	if err := os.RemoveAll(chroot); err != nil {
+		return fmt.Errorf("error removing %q: %v", chroot, err)
+	}
+
+	return nil
+}
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index dd5a0aa56..f6264d5b2 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -51,6 +51,10 @@ type Sandbox struct {
 	// Pid is the pid of the running sandbox (immutable). May be 0 is the sandbox
 	// is not running.
 	Pid int `json:"pid"`
+
+	// Chroot is the path to the chroot directory that the sandbox process
+	// is running in.
+	Chroot string `json:"chroot"`
 }
 
 // Create creates the sandbox process.
@@ -392,12 +396,11 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund
 		log.Infof("Sandbox will be started in new user namespace")
 		nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace})
 
-		if conf.TestOnlyAllowRunAsCurrentUser {
+		// If we have CAP_SETUID and CAP_SETGID, then we can also run
+		// as user nobody.
+		if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
 			log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid())
 		} else if specutils.CanSetUIDGID() {
-			// If we have CAP_SETUID and CAP_SETGID, then we can also run
-			// as user nobody.
-
 			// Map nobody in the new namespace to nobody in the parent namespace.
 			const nobody = 65534
 			cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{{
@@ -419,6 +422,23 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund
 		} else {
 			return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID")
 		}
+
+		// If we have CAP_SYS_ADMIN, we can create an empty chroot and
+		// bind-mount the executable inside it.
+		if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+			log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!")
+		} else if specutils.HasCapSysAdmin() {
+			log.Infof("Sandbox will be started in minimal chroot")
+			chroot, err := setUpChroot(conf.Platform)
+			if err != nil {
+				return fmt.Errorf("error setting up chroot: %v", err)
+			}
+			s.Chroot = chroot // Remember the path so that Destroy can tear it down.
+			cmd.SysProcAttr.Chroot = chroot
+			cmd.Path, cmd.Args[0] = chrootBinPath, chrootBinPath
+		} else {
+			return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN")
+		}
 	}
 
 	// Log the fds we are donating to the sandbox process.
@@ -525,6 +545,11 @@ func (s *Sandbox) Destroy() error {
 		log.Debugf("Killing sandbox %q", s.ID)
 		signalProcess(s.Pid, unix.SIGKILL)
 	}
+
+	if s.Chroot != "" {
+		return tearDownChroot(s.Chroot)
+	}
+
 	return nil
 }
 
diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go
index 356943a65..48a199a77 100644
--- a/runsc/specutils/namespace.go
+++ b/runsc/specutils/namespace.go
@@ -216,3 +216,15 @@ func CanSetUIDGID() bool {
 	return caps.Get(capability.EFFECTIVE, capability.CAP_SETUID) &&
 		caps.Get(capability.EFFECTIVE, capability.CAP_SETGID)
 }
+
+// HasCapSysAdmin reports whether CAP_SYS_ADMIN is in the effective capability
+// set of the current process. It returns false if the set cannot be loaded.
+func HasCapSysAdmin() bool {
+	procCaps, err := capability.NewPid2(os.Getpid())
+	if err == nil {
+		err = procCaps.Load()
+	}
+
+	// A failure to create or load the capability set counts as "no".
+	return err == nil && procCaps.Get(capability.EFFECTIVE, capability.CAP_SYS_ADMIN)
+}
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index 551718e9a..f3fa8d129 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -363,3 +363,44 @@ func DebugLogFile(logDir, subcommand string) (*os.File, error) {
 	filename := fmt.Sprintf("runsc.log.%s.%s", time.Now().Format("20060102-150405.000000"), subcommand)
 	return os.OpenFile(filepath.Join(logDir, filename), os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664)
 }
+
+// Mount creates the mount point dst (a directory when src is a directory or typ
+// is "proc", a file otherwise) and then mounts src on it with typ and flags.
+// NOTE(review): mount(2) suggests MS_RDONLY is ignored on the initial MS_BIND
+// mount; confirm whether read-only bind mounts need a follow-up remount.
+func Mount(src, dst, typ string, flags uint32) error {
+	// A proc mount has no source to stat; its mount point is a directory.
+	isDir := typ == "proc"
+	if !isDir {
+		fi, err := os.Stat(src)
+		if err != nil {
+			return fmt.Errorf("Stat(%q) failed: %v", src, err)
+		}
+		isDir = fi.IsDir()
+	}
+
+	if isDir {
+		// Create the destination directory.
+		if err := os.MkdirAll(dst, 0777); err != nil {
+			return fmt.Errorf("Mkdir(%q) failed: %v", dst, err)
+		}
+	} else {
+		// Create the parent directory. dst is an OS path, so use filepath.
+		parent := filepath.Dir(dst)
+		if err := os.MkdirAll(parent, 0777); err != nil {
+			return fmt.Errorf("Mkdir(%q) failed: %v", parent, err)
+		}
+		// Create the destination file if it does not exist.
+		f, err := os.OpenFile(dst, os.O_CREATE, 0777)
+		if err != nil {
+			return fmt.Errorf("Open(%q) failed: %v", dst, err)
+		}
+		f.Close()
+	}
+
+	// Do the mount.
+	if err := syscall.Mount(src, dst, typ, uintptr(flags), ""); err != nil {
+		return fmt.Errorf("Mount(%q, %q, %q, %#x) failed: %v", src, dst, typ, flags, err)
+	}
+	return nil
+}
diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD
index ca91e07ff..03ab3c4ac 100644
--- a/runsc/test/testutil/BUILD
+++ b/runsc/test/testutil/BUILD
@@ -18,6 +18,5 @@ go_library(
         "//runsc/specutils",
         "@com_github_cenkalti_backoff//:go_default_library",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
-        "@com_github_syndtr_gocapability//capability:go_default_library",
     ],
 )
diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go
index 77bd56912..4f012a8ea 100644
--- a/runsc/test/testutil/testutil.go
+++ b/runsc/test/testutil/testutil.go
@@ -32,7 +32,6 @@ import (
 
 	"github.com/cenkalti/backoff"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
-	"github.com/syndtr/gocapability/capability"
 	"gvisor.googlesource.com/gvisor/runsc/boot"
 	"gvisor.googlesource.com/gvisor/runsc/specutils"
 )
@@ -104,14 +103,14 @@ func FindFile(path string) (string, error) {
 // TestConfig return the default configuration to use in tests.
 func TestConfig() *boot.Config {
 	return &boot.Config{
-		Debug:                         true,
-		LogFormat:                     "text",
-		LogPackets:                    true,
-		Network:                       boot.NetworkNone,
-		Strace:                        true,
-		MultiContainer:                true,
-		FileAccess:                    boot.FileAccessProxyExclusive,
-		TestOnlyAllowRunAsCurrentUser: true,
+		Debug:          true,
+		LogFormat:      "text",
+		LogPackets:     true,
+		Network:        boot.NetworkNone,
+		Strace:         true,
+		MultiContainer: true,
+		FileAccess:     boot.FileAccessProxyExclusive,
+		TestOnlyAllowRunAsCurrentUserWithoutChroot: true,
 	}
 }
 
@@ -238,14 +237,7 @@ func WaitForHTTP(port int, timeout time.Duration) error {
 // RunAsRoot ensures the test runs with CAP_SYS_ADMIN. If need it will create
 // a new user namespace and reexecute the test as root inside of the namespace.
 func RunAsRoot(m *testing.M) {
-	caps, err := capability.NewPid2(os.Getpid())
-	if err != nil {
-		panic(err.Error())
-	}
-	if err := caps.Load(); err != nil {
-		panic(err.Error())
-	}
-	if caps.Get(capability.EFFECTIVE, capability.CAP_SYS_ADMIN) {
+	if specutils.HasCapSysAdmin() {
 		// Capability: check! Good to run.
 		os.Exit(m.Run())
 	}