1 files changed, 290 insertions, 0 deletions
diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
new file mode 100644
index 000000000..01204ab4d
--- /dev/null
+++ b/runsc/cmd/boot.go
@@ -0,0 +1,290 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"os"
+	"runtime/debug"
+	"strings"
+	"syscall"
+
+	"github.com/google/subcommands"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/flag"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+// Boot implements subcommands.Command for the "boot" command which starts a
+// new sandbox. It should not be called directly.
+//
+// Each field below is populated from the command-line flag registered for it
+// in SetFlags.
+type Boot struct {
+	// bundleDir is the directory containing the OCI spec.
+	bundleDir string
+
+	// specFD is the file descriptor that the spec will be read from.
+	specFD int
+
+	// controllerFD is the file descriptor of a stream socket for the
+	// control server that is donated to this process.
+	controllerFD int
+
+	// deviceFD is the file descriptor for the platform device file.
+	deviceFD int
+
+	// ioFDs is the list of FDs used to connect to FS gofers.
+	ioFDs intFlags
+
+	// stdioFDs are the fds for stdin, stdout, and stderr. They must be
+	// provided in that order.
+	stdioFDs intFlags
+
+	// console is set to true if the sandbox should allow terminal ioctl(2)
+	// syscalls.
+	console bool
+
+	// applyCaps determines if capabilities defined in the spec should be applied
+	// to the process.
+	applyCaps bool
+
+	// setUpRoot is set to true if the sandbox is started in an empty root.
+	setUpRoot bool
+
+	// cpuNum is the number of CPUs to create inside the sandbox.
+	cpuNum int
+
+	// totalMem sets the initial amount of total memory to report back to the
+	// container.
+	totalMem uint64
+
+	// userLogFD is the file descriptor to write user logs to. 0 means no
+	// logging.
+	userLogFD int
+
+	// startSyncFD is the file descriptor to synchronize runsc and sandbox.
+	startSyncFD int
+
+	// mountsFD is the file descriptor to read list of mounts after they have
+	// been resolved (direct paths, no symlinks). They are resolved outside the
+	// sandbox (e.g. gofer) and sent through this FD.
+	mountsFD int
+
+	// pidns is set if the sandbox is in its own pid namespace.
+	pidns bool
+
+	// attached is set to true to kill the sandbox process when the parent process
+	// terminates. This flag is set when the command execve's itself because
+	// parent death signal doesn't propagate through execve when uid/gid changes.
+	attached bool
+}
+
+// Name implements subcommands.Command.Name.
+func (*Boot) Name() string {
+	return "boot"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Boot) Synopsis() string {
+	return "launch a sandbox process (internal use only)"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Boot) Usage() string {
+	return `boot [flags] <container id>`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+//
+// It registers every flag accepted by the "boot" command on f and binds each
+// one to the matching field of b. The flags are grouped by purpose below.
+func (b *Boot) SetFlags(f *flag.FlagSet) {
+	// Bundle location.
+	f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory")
+
+	// Single file descriptors.
+	f.IntVar(&b.specFD, "spec-fd", -1, "required fd with the container spec")
+	f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process")
+	f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD to used to synchronize sandbox startup")
+	f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file")
+	f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to read list of mounts after they have been resolved (direct paths, no symlinks).")
+	f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.")
+
+	// Lists of file descriptors.
+	f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec")
+	f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order")
+
+	// Resources reported inside the sandbox.
+	f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox")
+	f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container")
+
+	// Boolean switches.
+	f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls")
+	f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process")
+	f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process")
+	f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace")
+	f.BoolVar(&b.attached, "attached", false, "if attached is true, kills the sandbox process when the parent process terminates")
+}
+
+// Execute implements subcommands.Command.Execute. It starts a sandbox in a
+// waiting state.
+//
+// args[0] must be a *boot.Config. args[1] must be a *syscall.WaitStatus; it
+// receives the application's exit status once the sandbox has exited.
+//
+// It returns subcommands.ExitUsageError when --spec-fd, --controller-fd or
+// --start-sync-fd is unset, or when there is not exactly one positional
+// argument (the container ID). It returns subcommands.ExitSuccess after the
+// application has exited. Other failures are reported through Fatalf.
+func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if b.specFD == -1 || b.controllerFD == -1 || b.startSyncFD == -1 || f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	// Ensure that if there is a panic, all goroutine stacks are printed.
+	debug.SetTraceback("system")
+
+	conf := args[0].(*boot.Config)
+
+	if b.attached {
+		// Ensure this process is killed after parent process terminates when
+		// attached mode is enabled. In the unfortunate event that the parent
+		// terminates before this point, this process leaks.
+		if err := unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0); err != nil {
+			Fatalf("error setting parent death signal: %v", err)
+		}
+	}
+
+	if b.setUpRoot {
+		if err := setUpChroot(b.pidns); err != nil {
+			Fatalf("error setting up chroot: %v", err)
+		}
+
+		// When --apply-caps is set, the re-exec is performed by the applyCaps
+		// branch further down instead. The re-exec is also skipped when
+		// conf.Rootless is set.
+		if !b.applyCaps && !conf.Rootless {
+			// Remove --setup-root arg to call myself. It has already been done.
+			args := prepareArgs(b.attached, "setup-root")
+
+			// Note that we've already read the spec from the spec FD, and
+			// we will read it again after the exec call. This works
+			// because the ReadSpecFromFile function seeks to the beginning
+			// of the file before reading.
+			if err := callSelfAsNobody(args); err != nil {
+				Fatalf("%v", err)
+			}
+			panic("callSelfAsNobody must never return success")
+		}
+	}
+
+	// Get the spec from the specFD.
+	specFile := os.NewFile(uintptr(b.specFD), "spec file")
+	defer specFile.Close()
+	spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile)
+	if err != nil {
+		Fatalf("reading spec: %v", err)
+	}
+	specutils.LogSpec(spec)
+
+	if b.applyCaps {
+		caps := spec.Process.Capabilities
+		if caps == nil {
+			// The spec defines no capabilities; start from empty sets.
+			caps = &specs.LinuxCapabilities{}
+		}
+
+		gPlatform, err := platform.Lookup(conf.Platform)
+		if err != nil {
+			Fatalf("loading platform: %v", err)
+		}
+		if gPlatform.Requirements().RequiresCapSysPtrace {
+			// The selected platform requires CAP_SYS_PTRACE; add it to the
+			// bounding, effective and permitted sets.
+			const c = "CAP_SYS_PTRACE"
+			caps.Bounding = append(caps.Bounding, c)
+			caps.Effective = append(caps.Effective, c)
+			caps.Permitted = append(caps.Permitted, c)
+		}
+
+		// Remove --apply-caps and --setup-root arg to call myself. Both have
+		// already been done.
+		args := prepareArgs(b.attached, "setup-root", "apply-caps")
+
+		// Note that we've already read the spec from the spec FD, and
+		// we will read it again after the exec call. This works
+		// because the ReadSpecFromFile function seeks to the beginning
+		// of the file before reading.
+		if err := setCapsAndCallSelf(args, caps); err != nil {
+			Fatalf("%v", err)
+		}
+		panic("setCapsAndCallSelf must never return success")
+	}
+
+	// Read resolved mount list and replace the original one from the spec.
+	mountsFile := os.NewFile(uintptr(b.mountsFD), "mounts file")
+	cleanMounts, err := specutils.ReadMounts(mountsFile)
+	if err != nil {
+		// Close explicitly: Fatalf does not run deferred calls.
+		mountsFile.Close()
+		Fatalf("Error reading mounts file: %v", err)
+	}
+	mountsFile.Close()
+	spec.Mounts = cleanMounts
+
+	// Create the loader.
+	bootArgs := boot.Args{
+		ID:           f.Arg(0),
+		Spec:         spec,
+		Conf:         conf,
+		ControllerFD: b.controllerFD,
+		Device:       os.NewFile(uintptr(b.deviceFD), "platform device"),
+		GoferFDs:     b.ioFDs.GetArray(),
+		StdioFDs:     b.stdioFDs.GetArray(),
+		Console:      b.console,
+		NumCPU:       b.cpuNum,
+		TotalMem:     b.totalMem,
+		UserLogFD:    b.userLogFD,
+	}
+	l, err := boot.New(bootArgs)
+	if err != nil {
+		Fatalf("creating loader: %v", err)
+	}
+
+	// Fatalf exits the process and doesn't run defers.
+	// 'l' must be destroyed explicitly after this point!
+
+	// Notify the parent process the sandbox has booted (and that the controller
+	// is up). The notification is a single zero byte.
+	startSyncFile := os.NewFile(uintptr(b.startSyncFD), "start-sync file")
+	buf := make([]byte, 1)
+	if w, err := startSyncFile.Write(buf); err != nil || w != 1 {
+		l.Destroy()
+		Fatalf("unable to write into the start-sync descriptor: %v", err)
+	}
+	// Closes startSyncFile because 'l.Run()' only returns when the sandbox exits.
+	startSyncFile.Close()
+
+	// Wait for the start signal from runsc.
+	l.WaitForStartSignal()
+
+	// Run the application and wait for it to finish.
+	if err := l.Run(); err != nil {
+		l.Destroy()
+		Fatalf("running sandbox: %v", err)
+	}
+
+	// Hand the application's exit status back to the caller through args[1].
+	ws := l.WaitExit()
+	log.Infof("application exiting with %+v", ws)
+	waitStatus := args[1].(*syscall.WaitStatus)
+	*waitStatus = syscall.WaitStatus(ws.Status())
+	l.Destroy()
+	return subcommands.ExitSuccess
+}
+
+func prepareArgs(attached bool, exclude ...string) []string {
+	var args []string
+	for _, arg := range os.Args {
+		for _, excl := range exclude {
+			if strings.Contains(arg, excl) {
+				goto skip
+			}
+		}
+		args = append(args, arg)
+		if attached && arg == "boot" {
+			// Strategicaly place "--attached" after the command. This is needed
+			// to ensure the new process is killed when the parent process terminates.
+			args = append(args, "--attached")
+		}
+	skip:
+	}
+	return args
+}