// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "os" "runtime/debug" "strings" "syscall" "github.com/google/subcommands" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/specutils" ) // Boot implements subcommands.Command for the "boot" command which starts a // new sandbox. It should not be called directly. type Boot struct { // bundleDir is the directory containing the OCI spec. bundleDir string // specFD is the file descriptor that the spec will be read from. specFD int // controllerFD is the file descriptor of a stream socket for the // control server that is donated to this process. controllerFD int // deviceFD is the file descriptor for the platform device file. deviceFD int // ioFDs is the list of FDs used to connect to FS gofers. ioFDs intFlags // stdioFDs are the fds for stdin, stdout, and stderr. They must be // provided in that order. stdioFDs intFlags // applyCaps determines if capabilities defined in the spec should be applied // to the process. applyCaps bool // setUpChroot is set to true if the sandbox is started in an empty root. setUpRoot bool // cpuNum number of CPUs to create inside the sandbox. cpuNum int // totalMem sets the initial amount of total memory to report back to the // container. totalMem uint64 // userLogFD is the file descriptor to write user logs to. userLogFD int // startSyncFD is the file descriptor to synchronize runsc and sandbox. startSyncFD int // mountsFD is the file descriptor to read list of mounts after they have // been resolved (direct paths, no symlinks). They are resolved outside the // sandbox (e.g. gofer) and sent through this FD. mountsFD int // pidns is set if the sandbox is in its own pid namespace. pidns bool // attached is set to true to kill the sandbox process when the parent process // terminates. This flag is set when the command execve's itself because // parent death signal doesn't propagate through execve when uid/gid changes. attached bool } // Name implements subcommands.Command.Name. func (*Boot) Name() string { return "boot" } // Synopsis implements subcommands.Command.Synopsis. func (*Boot) Synopsis() string { return "launch a sandbox process (internal use only)" } // Usage implements subcommands.Command.Usage. func (*Boot) Usage() string { return `boot [flags] <container id>` } // SetFlags implements subcommands.Command.SetFlags. func (b *Boot) SetFlags(f *flag.FlagSet) { f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory") f.IntVar(&b.specFD, "spec-fd", -1, "required fd with the container spec") f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file") f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec") f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order") f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process") f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace") f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox") f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container") f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.") f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD to used to synchronize sandbox startup") f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to read list of mounts after they have been resolved (direct paths, no symlinks).") f.BoolVar(&b.attached, "attached", false, "if attached is true, kills the sandbox process when the parent process terminates") } // Execute implements subcommands.Command.Execute. It starts a sandbox in a // waiting state. func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { if b.specFD == -1 || b.controllerFD == -1 || b.startSyncFD == -1 || f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } // Ensure that if there is a panic, all goroutine stacks are printed. debug.SetTraceback("system") conf := args[0].(*boot.Config) if b.attached { // Ensure this process is killed after parent process terminates when // attached mode is enabled. In the unfortunate event that the parent // terminates before this point, this process leaks. if err := unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0); err != nil { Fatalf("error setting parent death signal: %v", err) } } if b.setUpRoot { if err := setUpChroot(b.pidns); err != nil { Fatalf("error setting up chroot: %v", err) } if !b.applyCaps && !conf.Rootless { // Remove --apply-caps arg to call myself. It has already been done. args := prepareArgs(b.attached, "setup-root") // Note that we've already read the spec from the spec FD, and // we will read it again after the exec call. This works // because the ReadSpecFromFile function seeks to the beginning // of the file before reading. if err := callSelfAsNobody(args); err != nil { Fatalf("%v", err) } panic("callSelfAsNobody must never return success") } } // Get the spec from the specFD. specFile := os.NewFile(uintptr(b.specFD), "spec file") defer specFile.Close() spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile) if err != nil { Fatalf("reading spec: %v", err) } specutils.LogSpec(spec) if b.applyCaps { caps := spec.Process.Capabilities if caps == nil { caps = &specs.LinuxCapabilities{} } gPlatform, err := platform.Lookup(conf.Platform) if err != nil { Fatalf("loading platform: %v", err) } if gPlatform.Requirements().RequiresCapSysPtrace { // Ptrace platform requires extra capabilities. const c = "CAP_SYS_PTRACE" caps.Bounding = append(caps.Bounding, c) caps.Effective = append(caps.Effective, c) caps.Permitted = append(caps.Permitted, c) } // Remove --apply-caps and --setup-root arg to call myself. Both have // already been done. args := prepareArgs(b.attached, "setup-root", "apply-caps") // Note that we've already read the spec from the spec FD, and // we will read it again after the exec call. This works // because the ReadSpecFromFile function seeks to the beginning // of the file before reading. if err := setCapsAndCallSelf(args, caps); err != nil { Fatalf("%v", err) } panic("setCapsAndCallSelf must never return success") } // Read resolved mount list and replace the original one from the spec. mountsFile := os.NewFile(uintptr(b.mountsFD), "mounts file") cleanMounts, err := specutils.ReadMounts(mountsFile) if err != nil { mountsFile.Close() Fatalf("Error reading mounts file: %v", err) } mountsFile.Close() spec.Mounts = cleanMounts // Create the loader. bootArgs := boot.Args{ ID: f.Arg(0), Spec: spec, Conf: conf, ControllerFD: b.controllerFD, Device: os.NewFile(uintptr(b.deviceFD), "platform device"), GoferFDs: b.ioFDs.GetArray(), StdioFDs: b.stdioFDs.GetArray(), NumCPU: b.cpuNum, TotalMem: b.totalMem, UserLogFD: b.userLogFD, } l, err := boot.New(bootArgs) if err != nil { Fatalf("creating loader: %v", err) } // Fatalf exits the process and doesn't run defers. // 'l' must be destroyed explicitly after this point! // Notify the parent process the sandbox has booted (and that the controller // is up). startSyncFile := os.NewFile(uintptr(b.startSyncFD), "start-sync file") buf := make([]byte, 1) if w, err := startSyncFile.Write(buf); err != nil || w != 1 { l.Destroy() Fatalf("unable to write into the start-sync descriptor: %v", err) } // Closes startSyncFile because 'l.Run()' only returns when the sandbox exits. startSyncFile.Close() // Wait for the start signal from runsc. l.WaitForStartSignal() // Run the application and wait for it to finish. if err := l.Run(); err != nil { l.Destroy() Fatalf("running sandbox: %v", err) } ws := l.WaitExit() log.Infof("application exiting with %+v", ws) waitStatus := args[1].(*syscall.WaitStatus) *waitStatus = syscall.WaitStatus(ws.Status()) l.Destroy() return subcommands.ExitSuccess } func prepareArgs(attached bool, exclude ...string) []string { var args []string for _, arg := range os.Args { for _, excl := range exclude { if strings.Contains(arg, excl) { goto skip } } args = append(args, arg) if attached && arg == "boot" { // Strategicaly place "--attached" after the command. This is needed // to ensure the new process is killed when the parent process terminates. args = append(args, "--attached") } skip: } return args }