summaryrefslogtreecommitdiffhomepage
path: root/runsc
diff options
context:
space:
mode:
author gVisor bot <gvisor-bot@google.com> 2019-06-02 06:44:55 +0000
committer gVisor bot <gvisor-bot@google.com> 2019-06-02 06:44:55 +0000
commit ceb0d792f328d1fc0692197d8856a43c3936a571 (patch)
tree 83155f302eff44a78bcc30a3a08f4efe59a79379 /runsc
parent deb7ecf1e46862d54f4b102f2d163cfbcfc37f3b (diff)
parent 216da0b733dbed9aad9b2ab92ac75bcb906fd7ee (diff)
Merge 216da0b7 (automated)
Diffstat (limited to 'runsc')
-rw-r--r--runsc/boot/compat.go159
-rw-r--r--runsc/boot/compat_amd64.go77
-rw-r--r--runsc/boot/config.go253
-rw-r--r--runsc/boot/controller.go491
-rw-r--r--runsc/boot/debug.go29
-rw-r--r--runsc/boot/events.go81
-rw-r--r--runsc/boot/fds.go89
-rw-r--r--runsc/boot/filter/config.go493
-rw-r--r--runsc/boot/filter/extra_filters.go28
-rw-r--r--runsc/boot/filter/extra_filters_msan.go32
-rw-r--r--runsc/boot/filter/extra_filters_race.go40
-rw-r--r--runsc/boot/filter/filter.go71
-rw-r--r--runsc/boot/fs.go774
-rw-r--r--runsc/boot/limits.go154
-rw-r--r--runsc/boot/loader.go954
-rw-r--r--runsc/boot/network.go222
-rw-r--r--runsc/boot/strace.go40
-rw-r--r--runsc/cgroup/cgroup.go503
-rw-r--r--runsc/cmd/boot.go257
-rw-r--r--runsc/cmd/capability.go157
-rw-r--r--runsc/cmd/checkpoint.go150
-rw-r--r--runsc/cmd/chroot.go97
-rw-r--r--runsc/cmd/cmd.go117
-rw-r--r--runsc/cmd/create.go103
-rw-r--r--runsc/cmd/debug.go185
-rw-r--r--runsc/cmd/delete.go87
-rw-r--r--runsc/cmd/do.go310
-rw-r--r--runsc/cmd/events.go111
-rw-r--r--runsc/cmd/exec.go486
-rw-r--r--runsc/cmd/gofer.go446
-rw-r--r--runsc/cmd/kill.go154
-rw-r--r--runsc/cmd/list.go117
-rw-r--r--runsc/cmd/path.go28
-rw-r--r--runsc/cmd/pause.go68
-rw-r--r--runsc/cmd/ps.go86
-rw-r--r--runsc/cmd/restore.go106
-rw-r--r--runsc/cmd/resume.go69
-rw-r--r--runsc/cmd/run.go87
-rw-r--r--runsc/cmd/spec.go182
-rw-r--r--runsc/cmd/start.go65
-rw-r--r--runsc/cmd/state.go76
-rw-r--r--runsc/cmd/wait.go127
-rw-r--r--runsc/console/console.go63
-rw-r--r--runsc/container/container.go1053
-rw-r--r--runsc/container/hook.go111
-rw-r--r--runsc/container/status.go60
-rw-r--r--runsc/fsgofer/filter/config.go182
-rw-r--r--runsc/fsgofer/filter/extra_filters.go28
-rw-r--r--runsc/fsgofer/filter/extra_filters_msan.go33
-rw-r--r--runsc/fsgofer/filter/extra_filters_race.go42
-rw-r--r--runsc/fsgofer/filter/filter.go33
-rw-r--r--runsc/fsgofer/fsgofer.go1057
-rw-r--r--runsc/fsgofer/fsgofer_unsafe.go107
-rw-r--r--runsc/main.go279
-rw-r--r--runsc/sandbox/network.go375
-rw-r--r--runsc/sandbox/network_unsafe.go56
-rw-r--r--runsc/sandbox/sandbox.go992
-rw-r--r--runsc/specutils/fs.go137
-rw-r--r--runsc/specutils/namespace.go222
-rw-r--r--runsc/specutils/specutils.go494
-rw-r--r--runsc/version.go18
61 files changed, 13503 insertions, 0 deletions
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
new file mode 100644
index 000000000..c369e4d64
--- /dev/null
+++ b/runsc/boot/compat.go
@@ -0,0 +1,159 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+ "os"
+ "sync"
+ "syscall"
+
+ "github.com/golang/protobuf/proto"
+ "gvisor.googlesource.com/gvisor/pkg/abi"
+ "gvisor.googlesource.com/gvisor/pkg/eventchannel"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto"
+ ucspb "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/strace"
+ spb "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
+)
+
+// initCompatLogs creates a compatEmitter for the given log FD and registers
+// it with the event channel. If the emitter cannot be created, the error is
+// returned and nothing is registered.
+func initCompatLogs(fd int) error {
+	emitter, err := newCompatEmitter(fd)
+	if err == nil {
+		eventchannel.AddEmitter(emitter)
+	}
+	return err
+}
+
+// compatEmitter handles UnimplementedSyscall and UncaughtSignal events and
+// writes a log entry for them to sink.
+type compatEmitter struct {
+ // sink is the logger that compatibility messages are written to.
+ sink *log.BasicLogger
+
+ // nameMap is used to look up the name of a syscall from its number.
+ nameMap strace.SyscallMap
+
+ // mu protects the fields below.
+ mu sync.Mutex
+
+ // trackers map syscall number to the respective tracker instance.
+ // Protected by 'mu'.
+ trackers map[uint64]syscallTracker
+}
+
+// newCompatEmitter creates a compatEmitter that resolves syscall names using
+// the amd64 Linux syscall table. Messages always go to the default logger;
+// when logFD is greater than zero they are additionally emitted, through a
+// K8sJSONEmitter, to a file wrapping that descriptor.
+func newCompatEmitter(logFD int) (*compatEmitter, error) {
+	table, found := strace.Lookup(abi.Linux, arch.AMD64)
+	if !found {
+		return nil, fmt.Errorf("amd64 Linux syscall table not found")
+	}
+
+	// Always logs to default logger.
+	sink := log.Log()
+	if logFD > 0 {
+		userLog := os.NewFile(uintptr(logFD), "user log file")
+		// The right-hand side still refers to the default logger, so the new
+		// sink fans out to both destinations.
+		sink = &log.BasicLogger{
+			Level:   log.Info,
+			Emitter: log.MultiEmitter{sink, log.K8sJSONEmitter{log.Writer{Next: userLog}}},
+		}
+	}
+
+	return &compatEmitter{
+		sink:     sink,
+		nameMap:  table,
+		trackers: make(map[uint64]syscallTracker),
+	}, nil
+}
+
+// Emit implements eventchannel.Emitter.
+func (c *compatEmitter) Emit(msg proto.Message) (bool, error) {
+ switch m := msg.(type) {
+ case *spb.UnimplementedSyscall:
+ c.emitUnimplementedSyscall(m)
+ case *ucspb.UncaughtSignal:
+ c.emitUncaughtSignal(m)
+ }
+
+ return false, nil
+}
+
+// emitUnimplementedSyscall logs an unsupported syscall, subject to the
+// per-syscall tracker deciding whether this occurrence should be reported.
+// A tracker is created the first time a given syscall number is seen:
+// multiplexed syscalls get an argsTracker keyed on their command argument(s),
+// everything else gets a onceTracker.
+//
+// The event's registers must be of the amd64 variant; the type assertion
+// panics otherwise.
+func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) {
+	regs := us.Registers.GetArch().(*rpb.Registers_Amd64).Amd64
+
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	sysnr := regs.OrigRax
+	tracker := c.trackers[sysnr]
+	if tracker == nil {
+		switch sysnr {
+		case syscall.SYS_PRCTL, syscall.SYS_ARCH_PRCTL:
+			// args: cmd, ...
+			tracker = newArgsTracker(0)
+
+		case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL, syscall.SYS_FUTEX, syscall.SYS_FALLOCATE:
+			// args: fd/addr, cmd, ...
+			tracker = newArgsTracker(1)
+
+		case syscall.SYS_GETSOCKOPT, syscall.SYS_SETSOCKOPT:
+			// args: fd, level, name, ...
+			tracker = newArgsTracker(1, 2)
+
+		case syscall.SYS_SEMCTL:
+			// args: semid, semnum, cmd, ...
+			tracker = newArgsTracker(2)
+
+		default:
+			tracker = &onceTracker{}
+		}
+		c.trackers[sysnr] = tracker
+	}
+
+	if !tracker.shouldReport(regs) {
+		return
+	}
+	c.sink.Infof("Unsupported syscall: %s, regs: %+v", c.nameMap.Name(uintptr(sysnr)), regs)
+	tracker.onReported(regs)
+}
+
+// emitUncaughtSignal logs the signal name and number, the PID and TID, and
+// the fault address carried by an UncaughtSignal event.
+func (c *compatEmitter) emitUncaughtSignal(msg *ucspb.UncaughtSignal) {
+	signo := msg.SignalNumber
+	c.sink.Infof(
+		"Uncaught signal: %q (%d), PID: %d, TID: %d, fault addr: %#x",
+		syscall.Signal(signo), signo, msg.Pid, msg.Tid, msg.FaultAddr)
+}
+
+// Close implements eventchannel.Emitter. It drops the reference to the sink
+// logger and always returns nil.
+func (c *compatEmitter) Close() error {
+	c.sink = nil
+	return nil
+}
+
+// syscallTracker interface allows filters to apply differently depending on
+// the syscall and arguments.
+type syscallTracker interface {
+ // shouldReport returns true if the syscall should be reported.
+ shouldReport(regs *rpb.AMD64Registers) bool
+
+ // onReported marks the syscall as reported.
+ onReported(regs *rpb.AMD64Registers)
+}
+
+// onceTracker reports only a single time, used for most syscalls.
+type onceTracker struct {
+	// reported is set once the syscall has been reported.
+	reported bool
+}
+
+// shouldReport implements syscallTracker. It returns true until onReported
+// has been called; the registers are ignored.
+func (t *onceTracker) shouldReport(_ *rpb.AMD64Registers) bool {
+	return !t.reported
+}
+
+// onReported implements syscallTracker. It records that the syscall has been
+// reported; the registers are ignored.
+func (t *onceTracker) onReported(_ *rpb.AMD64Registers) {
+	t.reported = true
+}
diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go
new file mode 100644
index 000000000..99df5e614
--- /dev/null
+++ b/runsc/boot/compat_amd64.go
@@ -0,0 +1,77 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+
+ rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto"
+)
+
+// reportLimit is the max number of events that should be reported per tracker.
+// argsTracker.shouldReport stops reporting once its count reaches this value.
+const reportLimit = 100
+
+// argsTracker reports only once for each different combination of arguments.
+// It's used for generic syscalls like ioctl to report once per 'cmd'.
+type argsTracker struct {
+ // argsIdx is the syscall arguments to use as unique ID.
+ argsIdx []int
+ // reported is the set of argument keys (see key) already reported.
+ reported map[string]struct{}
+ // count is the number of reports made so far; it is compared against
+ // reportLimit.
+ count int
+}
+
+// newArgsTracker returns an argsTracker that identifies a syscall invocation
+// by the values of the arguments at the given indexes.
+func newArgsTracker(argIdx ...int) *argsTracker {
+	return &argsTracker{
+		argsIdx:  argIdx,
+		reported: map[string]struct{}{},
+	}
+}
+
+// cmd returns the command based on the syscall argument index.
+func (a *argsTracker) key(regs *rpb.AMD64Registers) string {
+ var rv string
+ for _, idx := range a.argsIdx {
+ rv += fmt.Sprintf("%d|", argVal(idx, regs))
+ }
+ return rv
+}
+
+func argVal(argIdx int, regs *rpb.AMD64Registers) uint32 {
+ switch argIdx {
+ case 0:
+ return uint32(regs.Rdi)
+ case 1:
+ return uint32(regs.Rsi)
+ case 2:
+ return uint32(regs.Rdx)
+ case 3:
+ return uint32(regs.R10)
+ case 4:
+ return uint32(regs.R8)
+ case 5:
+ return uint32(regs.R9)
+ }
+ panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+}
+
+// shouldReport implements syscallTracker. It returns true when the report
+// limit has not been reached and this argument combination has not been
+// reported before.
+func (a *argsTracker) shouldReport(regs *rpb.AMD64Registers) bool {
+	if a.count < reportLimit {
+		_, seen := a.reported[a.key(regs)]
+		return !seen
+	}
+	return false
+}
+
+// onReported implements syscallTracker. It increments the report count and
+// remembers this argument combination as reported.
+func (a *argsTracker) onReported(regs *rpb.AMD64Registers) {
+	a.count++
+	id := a.key(regs)
+	a.reported[id] = struct{}{}
+}
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
new file mode 100644
index 000000000..15f624f9b
--- /dev/null
+++ b/runsc/boot/config.go
@@ -0,0 +1,253 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+ "strconv"
+ "strings"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog"
+)
+
+// PlatformType tells which platform to use. The zero value is PlatformPtrace.
+type PlatformType int
+
+const (
+ // PlatformPtrace runs the sandbox with the ptrace platform.
+ PlatformPtrace PlatformType = iota
+
+ // PlatformKVM runs the sandbox with the KVM platform.
+ PlatformKVM
+)
+
+// MakePlatformType converts the strings "ptrace" and "kvm" to the matching
+// PlatformType. Any other string yields an error.
+func MakePlatformType(s string) (PlatformType, error) {
+	if s == "ptrace" {
+		return PlatformPtrace, nil
+	}
+	if s == "kvm" {
+		return PlatformKVM, nil
+	}
+	return 0, fmt.Errorf("invalid platform type %q", s)
+}
+
+// String returns the name accepted by MakePlatformType for p, or
+// "unknown(<n>)" for values that have no name.
+func (p PlatformType) String() string {
+	if p == PlatformPtrace {
+		return "ptrace"
+	}
+	if p == PlatformKVM {
+		return "kvm"
+	}
+	return fmt.Sprintf("unknown(%d)", p)
+}
+
+// FileAccessType tells how the filesystem is accessed. The zero value is
+// FileAccessShared.
+type FileAccessType int
+
+const (
+ // FileAccessShared sends IO requests to a Gofer process that validates the
+ // requests and forwards them to the host.
+ FileAccessShared FileAccessType = iota
+
+ // FileAccessExclusive is the same as FileAccessShared, but enables
+ // extra caching for improved performance. It should only be used if
+ // the sandbox has exclusive access to the filesystem.
+ FileAccessExclusive
+)
+
+// MakeFileAccessType converts the strings "shared" and "exclusive" to the
+// matching FileAccessType. Any other string yields an error.
+func MakeFileAccessType(s string) (FileAccessType, error) {
+	byName := map[string]FileAccessType{
+		"shared":    FileAccessShared,
+		"exclusive": FileAccessExclusive,
+	}
+	if t, ok := byName[s]; ok {
+		return t, nil
+	}
+	return 0, fmt.Errorf("invalid file access type %q", s)
+}
+
+// String returns the name accepted by MakeFileAccessType for f, or
+// "unknown(<n>)" for values that have no name.
+func (f FileAccessType) String() string {
+	if f == FileAccessShared {
+		return "shared"
+	}
+	if f == FileAccessExclusive {
+		return "exclusive"
+	}
+	return fmt.Sprintf("unknown(%d)", f)
+}
+
+// NetworkType tells which network stack to use. The zero value is
+// NetworkSandbox.
+type NetworkType int
+
+const (
+ // NetworkSandbox uses internal network stack, isolated from the host.
+ NetworkSandbox NetworkType = iota
+
+ // NetworkHost redirects network related syscalls to the host network.
+ NetworkHost
+
+ // NetworkNone sets up just loopback using netstack.
+ NetworkNone
+)
+
+// MakeNetworkType converts the strings "sandbox", "host" and "none" to the
+// matching NetworkType. Any other string yields an error.
+func MakeNetworkType(s string) (NetworkType, error) {
+	byName := map[string]NetworkType{
+		"sandbox": NetworkSandbox,
+		"host":    NetworkHost,
+		"none":    NetworkNone,
+	}
+	if t, ok := byName[s]; ok {
+		return t, nil
+	}
+	return 0, fmt.Errorf("invalid network type %q", s)
+}
+
+// String returns the name accepted by MakeNetworkType for n, or
+// "unknown(<n>)" for values that have no name.
+func (n NetworkType) String() string {
+	names := map[NetworkType]string{
+		NetworkSandbox: "sandbox",
+		NetworkHost:    "host",
+		NetworkNone:    "none",
+	}
+	if name, ok := names[n]; ok {
+		return name
+	}
+	return fmt.Sprintf("unknown(%d)", n)
+}
+
+// MakeWatchdogAction converts a string to a watchdog.Action. Matching is
+// case-insensitive: "log" and "logwarning" give watchdog.LogWarning and
+// "panic" gives watchdog.Panic. Any other string yields an error that quotes
+// the input as given.
+func MakeWatchdogAction(s string) (watchdog.Action, error) {
+	action := strings.ToLower(s)
+	if action == "log" || action == "logwarning" {
+		return watchdog.LogWarning, nil
+	}
+	if action == "panic" {
+		return watchdog.Panic, nil
+	}
+	return 0, fmt.Errorf("invalid watchdog action %q", s)
+}
+
+// Config holds configuration that is not part of the runtime spec.
+// ToFlags converts most of these fields back into command-line flags.
+type Config struct {
+ // RootDir is the runtime root directory.
+ RootDir string
+
+ // Debug indicates that debug logging should be enabled.
+ Debug bool
+
+ // LogFilename is the filename to log to, if not empty.
+ LogFilename string
+
+ // LogFormat is the log format.
+ LogFormat string
+
+ // DebugLog is the path to log debug information to, if not empty.
+ DebugLog string
+
+ // DebugLogFormat is the log format for debug.
+ DebugLogFormat string
+
+ // FileAccess indicates how the filesystem is accessed.
+ FileAccess FileAccessType
+
+ // Overlay is whether to wrap the root filesystem in an overlay.
+ Overlay bool
+
+ // Network indicates what type of network to use.
+ Network NetworkType
+
+ // EnableRaw indicates whether raw sockets should be enabled. Raw
+ // sockets are disabled by stripping CAP_NET_RAW from the list of
+ // capabilities.
+ EnableRaw bool
+
+ // GSO indicates that generic segmentation offload is enabled.
+ GSO bool
+
+ // LogPackets indicates that all network packets should be logged.
+ LogPackets bool
+
+ // Platform is the platform to run on.
+ Platform PlatformType
+
+ // Strace indicates that strace should be enabled.
+ Strace bool
+
+ // StraceSyscalls is the set of syscalls to trace. If Strace is
+ // true and this list is empty, then all syscalls will be traced.
+ StraceSyscalls []string
+
+ // StraceLogSize is the max size of data blobs to display.
+ StraceLogSize uint
+
+ // DisableSeccomp indicates whether seccomp syscall filters should be
+ // disabled. Pardon the double negation, but default to enabled is important.
+ DisableSeccomp bool
+
+ // WatchdogAction sets what action the watchdog takes when triggered.
+ WatchdogAction watchdog.Action
+
+ // PanicSignal registers signal handling that panics. Usually set to
+ // SIGUSR2(12) to troubleshoot hangs. -1 disables it.
+ PanicSignal int
+
+ // ProfileEnable is set to prepare the sandbox to be profiled.
+ ProfileEnable bool
+
+ // RestoreFile is the path to the saved container image.
+ RestoreFile string
+
+ // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
+ // tests. It allows runsc to start the sandbox process as the current
+ // user, and without chrooting the sandbox process. This can be
+ // necessary in test environments that have limited capabilities.
+ TestOnlyAllowRunAsCurrentUserWithoutChroot bool
+}
+
+// ToFlags returns a slice of flags that correspond to the given Config.
+func (c *Config) ToFlags() []string {
+ f := []string{
+ "--root=" + c.RootDir,
+ "--debug=" + strconv.FormatBool(c.Debug),
+ "--log=" + c.LogFilename,
+ "--log-format=" + c.LogFormat,
+ "--debug-log=" + c.DebugLog,
+ "--debug-log-format=" + c.DebugLogFormat,
+ "--file-access=" + c.FileAccess.String(),
+ "--overlay=" + strconv.FormatBool(c.Overlay),
+ "--network=" + c.Network.String(),
+ "--log-packets=" + strconv.FormatBool(c.LogPackets),
+ "--platform=" + c.Platform.String(),
+ "--strace=" + strconv.FormatBool(c.Strace),
+ "--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","),
+ "--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)),
+ "--watchdog-action=" + c.WatchdogAction.String(),
+ "--panic-signal=" + strconv.Itoa(c.PanicSignal),
+ "--profile=" + strconv.FormatBool(c.ProfileEnable),
+ "--net-raw=" + strconv.FormatBool(c.EnableRaw),
+ }
+ if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+ // Only include if set since it is never to be used by users.
+ f = append(f, "-TESTONLY-unsafe-nonroot=true")
+ }
+ return f
+}
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
new file mode 100644
index 000000000..72ab9ef86
--- /dev/null
+++ b/runsc/boot/controller.go
@@ -0,0 +1,491 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "errors"
+ "fmt"
+ "os"
+ "path"
+ "syscall"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/control/server"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/control"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/state"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
+ "gvisor.googlesource.com/gvisor/pkg/urpc"
+)
+
+// URPC endpoint names. Each value has the form "<receiver>.<Method>".
+const (
+ // ContainerCheckpoint checkpoints a container.
+ ContainerCheckpoint = "containerManager.Checkpoint"
+
+ // ContainerCreate creates a container.
+ ContainerCreate = "containerManager.Create"
+
+ // ContainerDestroy is used to stop a non-root container and free all
+ // associated resources in the sandbox.
+ ContainerDestroy = "containerManager.Destroy"
+
+ // ContainerEvent is the URPC endpoint for getting stats about the
+ // container used by "runsc events".
+ ContainerEvent = "containerManager.Event"
+
+ // ContainerExecuteAsync is the URPC endpoint for executing a command in a
+ // container.
+ ContainerExecuteAsync = "containerManager.ExecuteAsync"
+
+ // ContainerPause pauses the container.
+ ContainerPause = "containerManager.Pause"
+
+ // ContainerProcesses is the URPC endpoint for getting the list of
+ // processes running in a container.
+ ContainerProcesses = "containerManager.Processes"
+
+ // ContainerRestore restores a container from a statefile.
+ ContainerRestore = "containerManager.Restore"
+
+ // ContainerResume unpauses the paused container.
+ ContainerResume = "containerManager.Resume"
+
+ // ContainerSignal is used to send a signal to a container.
+ ContainerSignal = "containerManager.Signal"
+
+ // ContainerSignalProcess is used to send a signal to a particular
+ // process in a container.
+ ContainerSignalProcess = "containerManager.SignalProcess"
+
+ // ContainerStart is the URPC endpoint for running a non-root container
+ // within a sandbox.
+ ContainerStart = "containerManager.Start"
+
+ // ContainerWait is used to wait on the init process of the container
+ // and return its ExitStatus.
+ ContainerWait = "containerManager.Wait"
+
+ // ContainerWaitPID is used to wait on a process with a certain PID in
+ // the sandbox and return its ExitStatus.
+ ContainerWaitPID = "containerManager.WaitPID"
+
+ // NetworkCreateLinksAndRoutes is the URPC endpoint for creating links
+ // and routes in a network stack.
+ NetworkCreateLinksAndRoutes = "Network.CreateLinksAndRoutes"
+
+ // RootContainerStart is the URPC endpoint for starting a new sandbox
+ // with root container.
+ RootContainerStart = "containerManager.StartRoot"
+
+ // SandboxStacks collects sandbox stacks for debugging.
+ SandboxStacks = "debug.Stacks"
+
+ // Profiling related commands (see pprof.go for more details).
+ StartCPUProfile = "Profile.StartCPUProfile"
+ StopCPUProfile = "Profile.StopCPUProfile"
+ HeapProfile = "Profile.HeapProfile"
+ StartTrace = "Profile.StartTrace"
+ StopTrace = "Profile.StopTrace"
+)
+
+// ControlSocketAddr generates an abstract unix socket name for the given ID:
+// a leading NUL byte, the prefix "runsc-sandbox.", then the ID itself.
+func ControlSocketAddr(id string) string {
+	// Sprint adds no separator between string operands.
+	return fmt.Sprint("\x00runsc-sandbox.", id)
+}
+
+// controller holds the control server, and is used for communication into the
+// sandbox. It is created by newController.
+type controller struct {
+ // srv is the control server.
+ srv *server.Server
+
+ // manager holds the containerManager methods.
+ manager *containerManager
+}
+
+// newController creates a new controller. The caller must call
+// controller.srv.StartServing() to start the controller.
+//
+// The control server is created from fd and the following receivers are
+// registered on it, in order: the container manager, the Network receiver
+// (only when the kernel's network stack is an *epsocket.Stack), the debug
+// receiver, and the profiling receiver (only when l.conf.ProfileEnable is
+// set).
+func newController(fd int, l *Loader) (*controller, error) {
+	srv, err := server.CreateFromFD(fd)
+	if err != nil {
+		return nil, err
+	}
+
+	ctrl := &controller{
+		srv: srv,
+		manager: &containerManager{
+			startChan:       make(chan struct{}),
+			startResultChan: make(chan error),
+			l:               l,
+		},
+	}
+	srv.Register(ctrl.manager)
+
+	if eps, ok := l.k.NetworkStack().(*epsocket.Stack); ok {
+		srv.Register(&Network{Stack: eps.Stack})
+	}
+
+	srv.Register(&debug{})
+	if l.conf.ProfileEnable {
+		srv.Register(&control.Profile{})
+	}
+
+	return ctrl, nil
+}
+
+// containerManager manages sandbox containers.
+type containerManager struct {
+ // startChan is used to signal when the root container process should
+ // be started.
+ startChan chan struct{}
+
+ // startResultChan is used to signal when the root container has
+ // started. Any errors encountered during startup will be sent to the
+ // channel. A nil value indicates success.
+ startResultChan chan error
+
+ // l is the loader that creates containers and sandboxes.
+ l *Loader
+}
+
+// StartRoot will start the root container process. It signals startChan and
+// then blocks until a result arrives on startResultChan; a non-nil result is
+// returned wrapped in a "starting sandbox" error.
+func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error {
+	log.Debugf("containerManager.StartRoot %q", *cid)
+
+	// Tell the root container to start and wait for the result.
+	cm.startChan <- struct{}{}
+	err := <-cm.startResultChan
+	if err == nil {
+		return nil
+	}
+	return fmt.Errorf("starting sandbox: %v", err)
+}
+
+// Processes retrieves information about processes running in the sandbox.
+// The container ID and the output slice are passed on to control.Processes.
+func (cm *containerManager) Processes(cid *string, out *[]*control.Process) error {
+	id := *cid
+	log.Debugf("containerManager.Processes: %q", id)
+	return control.Processes(cm.l.k, id, out)
+}
+
+// Create creates a container within a sandbox by delegating to the loader's
+// createContainer with the given container ID.
+func (cm *containerManager) Create(cid *string, _ *struct{}) error {
+	id := *cid
+	log.Debugf("containerManager.Create: %q", id)
+	return cm.l.createContainer(id)
+}
+
+// StartArgs contains arguments to the Start method.
+type StartArgs struct {
+ // Spec is the spec of the container to start.
+ Spec *specs.Spec
+
+ // Conf is the runsc-specific configuration for the sandbox.
+ Conf *Config
+
+ // CID is the ID of the container to start.
+ CID string
+
+ // FilePayload contains, in order:
+ // * stdin, stdout, and stderr.
+ // * the file descriptor over which the sandbox will
+ // request files from its root filesystem.
+ urpc.FilePayload
+}
+
+// Start runs a created container within a sandbox.
+//
+// The arguments are validated first: args, args.Spec and args.Conf must be
+// non-nil, args.CID must be non-empty and unchanged by path.Clean, and at
+// least four files must be supplied. The container is then started through
+// the loader; an error from the loader is logged and returned unchanged.
+func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
+	log.Debugf("containerManager.Start: %+v", args)
+
+	// Validate arguments. Cases are evaluated top to bottom, so args is known
+	// to be non-nil before any of its fields is inspected.
+	switch {
+	case args == nil:
+		return errors.New("start missing arguments")
+	case args.Spec == nil:
+		return errors.New("start arguments missing spec")
+	case args.Conf == nil:
+		return errors.New("start arguments missing config")
+	case args.CID == "":
+		return errors.New("start argument missing container ID")
+	case path.Clean(args.CID) != args.CID:
+		// Prevent CIDs containing ".." from confusing the sentry when creating
+		// /containers/<cid> directory.
+		// TODO(b/129293409): Once we have multiple independent roots, this
+		// check won't be necessary.
+		return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID)
+	case len(args.FilePayload.Files) < 4:
+		return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer")
+	}
+
+	if err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files); err != nil {
+		log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err)
+		return err
+	}
+
+	log.Debugf("Container %q started", args.CID)
+	return nil
+}
+
+// Destroy stops a container if it is still running and cleans up its
+// filesystem. The work is delegated to the loader's destroyContainer.
+func (cm *containerManager) Destroy(cid *string, _ *struct{}) error {
+	id := *cid
+	log.Debugf("containerManager.destroy %q", id)
+	return cm.l.destroyContainer(id)
+}
+
+// ExecuteAsync starts running a command on a created or running sandbox. It
+// returns the PID of the new process through pid. On failure the error is
+// logged and returned, and pid is left untouched.
+func (cm *containerManager) ExecuteAsync(args *control.ExecArgs, pid *int32) error {
+	log.Debugf("containerManager.ExecuteAsync: %+v", args)
+
+	tgid, err := cm.l.executeAsync(args)
+	if err == nil {
+		*pid = int32(tgid)
+		return nil
+	}
+
+	log.Debugf("containerManager.ExecuteAsync failed: %+v: %v", args, err)
+	return err
+}
+
+// Checkpoint pauses a sandbox and saves its state. The save is performed by
+// a control.State built from the loader's current kernel and watchdog, using
+// the options in o.
+func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error {
+	log.Debugf("containerManager.Checkpoint")
+
+	saver := control.State{
+		Kernel:   cm.l.k,
+		Watchdog: cm.l.watchdog,
+	}
+	return saver.Save(o, nil)
+}
+
+// Pause suspends a container by pausing the loader's kernel. It always
+// returns nil.
+func (cm *containerManager) Pause(_, _ *struct{}) error {
+	log.Debugf("containerManager.Pause")
+
+	k := cm.l.k
+	k.Pause()
+	return nil
+}
+
+// RestoreOpts contains options related to restoring a container's file system.
+// It is the argument type of containerManager.Restore.
+type RestoreOpts struct {
+ // FilePayload contains the state file to be restored, followed by the
+ // platform device file if necessary.
+ urpc.FilePayload
+
+ // SandboxID contains the ID of the sandbox.
+ SandboxID string
+}
+
+// Restore loads a container from a statefile.
+// The container's current kernel is destroyed, a restore environment is
+// created, and the kernel is recreated with the restore state file. The
+// container then sends the signal to start.
+//
+// o.FilePayload must carry one or two files: the state file, optionally
+// followed by the platform device file. An error is returned for any other
+// count, when the state file is empty, or when any restore step fails.
+func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
+ log.Debugf("containerManager.Restore")
+
+ // specFile is the state file to load (Files[0]); deviceFile stays nil
+ // unless a second file was passed.
+ var specFile, deviceFile *os.File
+ switch numFiles := len(o.FilePayload.Files); numFiles {
+ case 2:
+ // The device file is donated to the platform.
+ // Can't take ownership away from os.File. dup them to get a new FD.
+ fd, err := syscall.Dup(int(o.FilePayload.Files[1].Fd()))
+ if err != nil {
+ return fmt.Errorf("failed to dup file: %v", err)
+ }
+ deviceFile = os.NewFile(uintptr(fd), "platform device")
+ // Fall through so that the state file is picked up as well.
+ fallthrough
+ case 1:
+ specFile = o.FilePayload.Files[0]
+ case 0:
+ return fmt.Errorf("at least one file must be passed to Restore")
+ default:
+ return fmt.Errorf("at most two files may be passed to Restore")
+ }
+
+ // Keep a reference to the network stack before the old kernel goes away;
+ // it is handed to the state loader below.
+ networkStack := cm.l.k.NetworkStack()
+ // Destroy the old kernel and create a new kernel.
+ // NOTE(review): from this point on every error return leaves the old
+ // kernel destroyed; the empty-state-file check further down also runs
+ // only after this. Confirm that callers treat a failed Restore as fatal.
+ cm.l.k.Pause()
+ cm.l.k.Destroy()
+
+ // NOTE(review): the dup'ed deviceFile is not closed here on the error
+ // paths below; presumably createPlatform takes ownership of it -- verify.
+ p, err := createPlatform(cm.l.conf, deviceFile)
+ if err != nil {
+ return fmt.Errorf("creating platform: %v", err)
+ }
+ k := &kernel.Kernel{
+ Platform: p,
+ }
+ mf, err := createMemoryFile()
+ if err != nil {
+ return fmt.Errorf("creating memory file: %v", err)
+ }
+ k.SetMemoryFile(mf)
+ cm.l.k = k
+
+ // Set up the restore environment.
+ fds := &fdDispenser{fds: cm.l.goferFDs}
+ renv, err := createRestoreEnvironment(cm.l.spec, cm.l.conf, fds)
+ if err != nil {
+ return fmt.Errorf("creating RestoreEnvironment: %v", err)
+ }
+ fs.SetRestoreEnvironment(*renv)
+
+ // Prepare to load from the state file.
+ if eps, ok := networkStack.(*epsocket.Stack); ok {
+ stack.StackFromEnv = eps.Stack // FIXME(b/36201077)
+ }
+ info, err := specFile.Stat()
+ if err != nil {
+ return err
+ }
+ if info.Size() == 0 {
+ return fmt.Errorf("file cannot be empty")
+ }
+
+ // Load the state.
+ loadOpts := state.LoadOpts{Source: specFile}
+ if err := loadOpts.Load(k, networkStack); err != nil {
+ return err
+ }
+
+ // Set timekeeper.
+ k.Timekeeper().SetClocks(time.NewCalibratedClocks())
+
+ // Since we have a new kernel we also must make a new watchdog.
+ watchdog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction)
+
+ // Change the loader fields to reflect the changes made when restoring.
+ // (cm.l.k was already pointed at k above; it is assigned again here.)
+ cm.l.k = k
+ cm.l.watchdog = watchdog
+ cm.l.rootProcArgs = kernel.CreateProcessArgs{}
+ cm.l.restore = true
+
+ // Reinitialize the sandbox ID and processes map. Note that it doesn't
+ // restore the state of multiple containers, nor exec processes.
+ cm.l.sandboxID = o.SandboxID
+ cm.l.mu.Lock()
+ eid := execID{cid: o.SandboxID}
+ cm.l.processes = map[execID]*execProcess{
+ eid: {
+ tg: cm.l.k.GlobalInit(),
+ },
+ }
+ cm.l.mu.Unlock()
+
+ // Tell the root container to start and wait for the result.
+ cm.startChan <- struct{}{}
+ if err := <-cm.startResultChan; err != nil {
+ return fmt.Errorf("starting sandbox: %v", err)
+ }
+
+ return nil
+}
+
+// Resume unpauses a container by unpausing the loader's kernel. It always
+// returns nil.
+func (cm *containerManager) Resume(_, _ *struct{}) error {
+	log.Debugf("containerManager.Resume")
+
+	k := cm.l.k
+	k.Unpause()
+	return nil
+}
+
+// Wait waits for the init process in the given container.
+//
+// cid is the ID of the container to wait on. The wait status is stored in
+// *waitStatus by the loader's waitContainer, whose error is returned as is.
+func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error {
+	log.Debugf("containerManager.Wait")
+	err := cm.l.waitContainer(*cid, waitStatus)
+	// Log the status value itself: formatting the pointer with %v would print
+	// its address, which is useless for debugging.
+	log.Debugf("containerManager.Wait returned, waitStatus: %v: %v", *waitStatus, err)
+	return err
+}
+
+// WaitPIDArgs are arguments to the WaitPID method.
+type WaitPIDArgs struct {
+ // PID is the PID in the container's PID namespace.
+ PID int32
+
+ // CID is the container ID.
+ CID string
+
+ // ClearStatus determines whether the exit status of the process should
+ // be cleared when WaitPID returns.
+ ClearStatus bool
+}
+
+// WaitPID waits for the process with PID 'pid' in the sandbox.
+//
+// args selects the process (PID and container ID) and says whether its exit
+// status should be cleared. The wait status is written through waitStatus by
+// the loader's waitPID, whose error is returned as is.
+func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error {
+	// Log under this method's own name so that its entries can be told apart
+	// from those of Wait.
+	log.Debugf("containerManager.WaitPID: %+v", args)
+	return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, args.ClearStatus, waitStatus)
+}
+
+// SignalDeliveryMode enumerates different signal delivery modes. The zero
+// value is DeliverToProcess.
+type SignalDeliveryMode int
+
+const (
+ // DeliverToProcess delivers the signal to the container process with
+ // the specified PID. If PID is 0, then the container init process is
+ // signaled.
+ DeliverToProcess SignalDeliveryMode = iota
+
+ // DeliverToAllProcesses delivers the signal to all processes in the
+ // container. PID must be 0.
+ DeliverToAllProcesses
+
+ // DeliverToForegroundProcessGroup delivers the signal to the
+ // foreground process group in the same TTY session as the specified
+ // process. If PID is 0, then the signal is delivered to the foreground
+ // process group for the TTY for the init process.
+ DeliverToForegroundProcessGroup
+)
+
+// String returns a human-readable name for the delivery mode, or an
+// "unknown signal delivery mode" message containing the numeric value for
+// modes that have no name.
+func (s SignalDeliveryMode) String() string {
+	switch s {
+	case DeliverToProcess:
+		return "Process"
+	case DeliverToAllProcesses:
+		return "All"
+	case DeliverToForegroundProcessGroup:
+		return "Foreground Process Group"
+	default:
+		return fmt.Sprintf("unknown signal delivery mode: %d", s)
+	}
+}
+
+// SignalArgs are arguments to the Signal method.
+type SignalArgs struct {
+ // CID is the container ID.
+ CID string
+
+ // Signo is the signal to send to the process.
+ Signo int32
+
+ // PID is the process ID in the given container that will be signaled.
+ // If 0, the root container will be signalled.
+ // NOTE(review): the Signal and SignalDeliveryMode comments say that PID 0
+ // selects the container's init process; confirm which wording is right.
+ PID int32
+
+ // Mode is the signal delivery mode.
+ Mode SignalDeliveryMode
+}
+
+// Signal sends a signal to one or more processes in a container. If args.PID
+// is 0, then the container init process is used. Depending on the args.Mode
+// option, the signal may be sent directly to the indicated process, to all
+// processes in the container, or to the foreground process group.
+func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error {
+	log.Debugf("containerManager.Signal %+v", args)
+
+	loader := cm.l
+	return loader.signal(args.CID, args.PID, args.Signo, args.Mode)
+}
diff --git a/runsc/boot/debug.go b/runsc/boot/debug.go
new file mode 100644
index 000000000..79f7387ac
--- /dev/null
+++ b/runsc/boot/debug.go
@@ -0,0 +1,29 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/log"
+)
+
+// debug is the receiver for the debugging operations defined in this file.
+// It has no fields.
+type debug struct{}
+
+// Stacks gathers the sandbox stacks through log.Stacks(true) and stores them,
+// converted to a string, in *stacks. The first parameter is unused and the
+// returned error is always nil.
+func (*debug) Stacks(_ *struct{}, stacks *string) error {
+	*stacks = string(log.Stacks(true))
+	return nil
+}
diff --git a/runsc/boot/events.go b/runsc/boot/events.go
new file mode 100644
index 000000000..ffd99f5e9
--- /dev/null
+++ b/runsc/boot/events.go
@@ -0,0 +1,81 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+)
+
+// Event struct for encoding the event data to JSON. Corresponds to runc's
+// main.event struct.
+type Event struct {
+ // Type names the kind of event. The containerManager.Event method in this
+ // file sets it to "stats".
+ Type string `json:"type"`
+ // ID is encoded under the "id" key. NOTE(review): presumably the container
+ // ID; containerManager.Event in this file leaves it empty, so confirm
+ // which caller fills it in.
+ ID string `json:"id"`
+ // Data is the event payload (a *Stats for "stats" events). It is left out
+ // of the JSON output when nil.
+ Data interface{} `json:"data,omitempty"`
+}
+
+// Stats is the runc specific stats structure for stability when encoding and
+// decoding stats.
+type Stats struct {
+ // Memory holds memory statistics; see populateMemory.
+ Memory Memory `json:"memory"`
+ // Pids holds process statistics; see populatePIDs.
+ Pids Pids `json:"pids"`
+}
+
+// Pids contains stats on processes.
+type Pids struct {
+ // Current is set by populatePIDs to the number of thread groups returned
+ // by the kernel's root task set. Omitted from JSON when zero.
+ Current uint64 `json:"current,omitempty"`
+ // Limit is not assigned anywhere in this file. Omitted from JSON when
+ // zero.
+ Limit uint64 `json:"limit,omitempty"`
+}
+
+// MemoryEntry contains stats on a kind of memory.
+//
+// Limit and Failcnt are always encoded; Usage and Max are omitted from JSON
+// when zero. Within this file only Usage is ever assigned (by
+// populateMemory).
+type MemoryEntry struct {
+ Limit uint64 `json:"limit"`
+ Usage uint64 `json:"usage,omitempty"`
+ Max uint64 `json:"max,omitempty"`
+ Failcnt uint64 `json:"failcnt"`
+}
+
+// Memory contains stats on memory.
+//
+// Within this file only Usage.Usage is populated (by populateMemory); the
+// remaining fields keep their zero values.
+//
+// Note that encoding/json's "omitempty" option never omits struct values, so
+// the MemoryEntry fields below are always present in the encoded output.
+type Memory struct {
+ Cache uint64 `json:"cache,omitempty"`
+ Usage MemoryEntry `json:"usage,omitempty"`
+ Swap MemoryEntry `json:"swap,omitempty"`
+ Kernel MemoryEntry `json:"kernel,omitempty"`
+ KernelTCP MemoryEntry `json:"kernelTCP,omitempty"`
+ Raw map[string]uint64 `json:"raw,omitempty"`
+}
+
+// Event gets the events from the container.
+func (cm *containerManager) Event(_ *struct{}, out *Event) error {
+ stats := &Stats{}
+ stats.populateMemory(cm.l.k)
+ stats.populatePIDs(cm.l.k)
+ *out = Event{Type: "stats", Data: stats}
+ return nil
+}
+
+// populateMemory refreshes the usage of the kernel's memory file and then
+// records the total reported by usage.MemoryAccounting.Copy in
+// s.Memory.Usage.Usage. All other fields of s.Memory.Usage are reset to zero.
+// Nothing returned by UpdateUsage is inspected here.
+func (s *Stats) populateMemory(k *kernel.Kernel) {
+	k.MemoryFile().UpdateUsage()
+	_, total := usage.MemoryAccounting.Copy()
+	s.Memory.Usage = MemoryEntry{Usage: total}
+}
+
+// populatePIDs sets s.Pids.Current to the number of thread groups returned by
+// the root of the kernel's task set.
+func (s *Stats) populatePIDs(k *kernel.Kernel) {
+	threadGroups := k.TaskSet().Root.ThreadGroups()
+	s.Pids.Current = uint64(len(threadGroups))
+}
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go
new file mode 100644
index 000000000..4e428b49c
--- /dev/null
+++ b/runsc/boot/fds.go
@@ -0,0 +1,89 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+)
+
+// createFDMap creates an FD map that contains stdin, stdout, and stderr,
+// installed at application FDs 0, 1 and 2 respectively.
+//
+// stdioFDs must hold exactly three host FDs, in the order stdin, stdout,
+// stderr; otherwise an error is returned. If console is true, the FDs are
+// treated as a TTY: only one host FD is imported (as a TTY file, so that ioctl
+// calls are passed through to the host FD) and the resulting file is shared by
+// all three application FDs. If console is false, each host FD is imported
+// separately as a regular host file.
+//
+// The host FDs are processed in slice order, so in console mode the TTY file
+// is always imported from stdioFDs[0].
+//
+// NOTE(review): the previous comment stated that, upon success, createFDMap
+// dups then closes stdioFDs. Nothing in this function does so directly; if it
+// happens, it happens inside host.ImportFile — confirm.
+//
+// On success the caller owns one reference on the returned FDMap.
+func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) {
+	if len(stdioFDs) != 3 {
+		return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
+	}
+
+	fdm := k.NewFDMap()
+	// Drops the reference taken by NewFDMap. On the success path an extra
+	// reference is taken just before returning, so the map survives; on any
+	// error path this releases the map.
+	defer fdm.DecRef()
+	mounter := fs.FileOwnerFromContext(ctx)
+
+	// Iterate over the slice rather than a map: the slice index is the
+	// application FD, and the fixed order makes the choice of the host FD
+	// backing the shared TTY file deterministic.
+	var ttyFile *fs.File
+	for appFD, hostFD := range stdioFDs {
+		var appFile *fs.File
+
+		if console {
+			// Import the file as a host TTY file.
+			if ttyFile == nil {
+				var err error
+				appFile, err = host.ImportFile(ctx, hostFD, mounter, true /* isTTY */)
+				if err != nil {
+					return nil, err
+				}
+				// The reference is released when createFDMap
+				// returns, i.e. after the file was added to fdm.
+				defer appFile.DecRef()
+
+				// Remember this in the TTY file, as we will
+				// use it for the other stdio FDs.
+				ttyFile = appFile
+			} else {
+				// Re-use the existing TTY file, as all three
+				// stdio FDs must point to the same fs.File in
+				// order to share TTY state, specifically the
+				// foreground process group id.
+				appFile = ttyFile
+			}
+		} else {
+			// Import the file as a regular host file.
+			var err error
+			appFile, err = host.ImportFile(ctx, hostFD, mounter, false /* isTTY */)
+			if err != nil {
+				return nil, err
+			}
+			defer appFile.DecRef()
+		}
+
+		// Add the file to the FD map.
+		if err := fdm.NewFDAt(kdefs.FD(appFD), appFile, kernel.FDFlags{}, l); err != nil {
+			return nil, err
+		}
+	}
+
+	// Balance the deferred DecRef above so the caller receives a live map.
+	fdm.IncRef()
+	return fdm, nil
+}
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
new file mode 100644
index 000000000..652da1cef
--- /dev/null
+++ b/runsc/boot/filter/config.go
@@ -0,0 +1,493 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package filter
+
+import (
+ "os"
+ "syscall"
+
+ "golang.org/x/sys/unix"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased"
+)
+
+// allowedSyscalls is the set of syscalls executed by the Sentry to the host OS.
+//
+// Each key is a host syscall number. The value lists rules; within a rule the
+// elements are matched positionally against the syscall arguments
+// (seccomp.AllowAny accepts anything, seccomp.AllowValue requires that exact
+// value). Install uses this table as the base set and merges the optional
+// filter sets of this package into it.
+//
+// NOTE(review): entries mapped to {} carry no argument rules; presumably the
+// syscall is then allowed with any arguments — confirm against pkg/seccomp.
+var allowedSyscalls = seccomp.SyscallRules{
+ // Only reading and setting the FS base are permitted.
+ syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
+ {seccomp.AllowValue(linux.ARCH_GET_FS)},
+ {seccomp.AllowValue(linux.ARCH_SET_FS)},
+ },
+ syscall.SYS_CLOCK_GETTIME: {},
+ // The first argument (clone flags) must be exactly this combination.
+ syscall.SYS_CLONE: []seccomp.Rule{
+ {
+ seccomp.AllowValue(
+ syscall.CLONE_VM |
+ syscall.CLONE_FS |
+ syscall.CLONE_FILES |
+ syscall.CLONE_SIGHAND |
+ syscall.CLONE_SYSVSEM |
+ syscall.CLONE_THREAD),
+ },
+ },
+ syscall.SYS_CLOSE: {},
+ syscall.SYS_DUP: {},
+ syscall.SYS_EPOLL_CREATE1: {},
+ syscall.SYS_EPOLL_CTL: {},
+ // The fifth argument must be 0; the first four are unrestricted.
+ syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(0),
+ },
+ },
+ syscall.SYS_EVENTFD2: []seccomp.Rule{
+ {
+ seccomp.AllowValue(0),
+ seccomp.AllowValue(0),
+ },
+ },
+ syscall.SYS_EXIT: {},
+ syscall.SYS_EXIT_GROUP: {},
+ syscall.SYS_FALLOCATE: {},
+ syscall.SYS_FCHMOD: {},
+ // The second argument (the fcntl command) is limited to F_GETFL,
+ // F_SETFL and F_GETFD.
+ syscall.SYS_FCNTL: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.F_GETFL),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.F_SETFL),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.F_GETFD),
+ },
+ },
+ syscall.SYS_FSTAT: {},
+ syscall.SYS_FSYNC: {},
+ syscall.SYS_FTRUNCATE: {},
+ // Only private FUTEX_WAIT and FUTEX_WAKE operations whose fifth
+ // argument is 0 are permitted.
+ syscall.SYS_FUTEX: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(0),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(0),
+ },
+ },
+ syscall.SYS_GETPID: {},
+ unix.SYS_GETRANDOM: {},
+ // Only a fixed set of SOL_SOCKET options may be queried.
+ syscall.SYS_GETSOCKOPT: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_DOMAIN),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_TYPE),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_ERROR),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_SNDBUF),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_REUSEADDR),
+ },
+ },
+ syscall.SYS_GETTID: {},
+ syscall.SYS_GETTIMEOFDAY: {},
+ // SYS_IOCTL is needed for terminal support, but we only allow
+ // setting/getting termios and winsize.
+ syscall.SYS_IOCTL: []seccomp.Rule{
+ {
+ seccomp.AllowAny{}, /* fd */
+ seccomp.AllowValue(linux.TCGETS),
+ seccomp.AllowAny{}, /* termios struct */
+ },
+ {
+ seccomp.AllowAny{}, /* fd */
+ seccomp.AllowValue(linux.TCSETS),
+ seccomp.AllowAny{}, /* termios struct */
+ },
+ {
+ seccomp.AllowAny{}, /* fd */
+ seccomp.AllowValue(linux.TCSETSF),
+ seccomp.AllowAny{}, /* termios struct */
+ },
+ {
+ seccomp.AllowAny{}, /* fd */
+ seccomp.AllowValue(linux.TCSETSW),
+ seccomp.AllowAny{}, /* termios struct */
+ },
+ {
+ seccomp.AllowAny{}, /* fd */
+ seccomp.AllowValue(linux.TIOCSWINSZ),
+ seccomp.AllowAny{}, /* winsize struct */
+ },
+ {
+ seccomp.AllowAny{}, /* fd */
+ seccomp.AllowValue(linux.TIOCGWINSZ),
+ seccomp.AllowAny{}, /* winsize struct */
+ },
+ },
+ syscall.SYS_LSEEK: {},
+ syscall.SYS_MADVISE: {},
+ syscall.SYS_MINCORE: {},
+ // The fourth argument (mmap flags) must match one of the listed
+ // combinations exactly. The MAP_FIXED variant additionally requires the
+ // third argument (protection) to be PROT_READ|PROT_WRITE.
+ syscall.SYS_MMAP: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MAP_SHARED),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MAP_PRIVATE),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.PROT_WRITE | syscall.PROT_READ),
+ seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
+ },
+ },
+ syscall.SYS_MPROTECT: {},
+ syscall.SYS_MUNMAP: {},
+ syscall.SYS_NANOSLEEP: {},
+ syscall.SYS_POLL: {},
+ syscall.SYS_PREAD64: {},
+ syscall.SYS_PWRITE64: {},
+ syscall.SYS_READ: {},
+ syscall.SYS_RECVMSG: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
+ },
+ },
+ // The third argument is pinned to fdbased.MaxMsgsPerRecv, the fourth to
+ // MSG_DONTWAIT and the fifth to 0.
+ syscall.SYS_RECVMMSG: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(fdbased.MaxMsgsPerRecv),
+ seccomp.AllowValue(syscall.MSG_DONTWAIT),
+ seccomp.AllowValue(0),
+ },
+ },
+ syscall.SYS_RESTART_SYSCALL: {},
+ syscall.SYS_RT_SIGACTION: {},
+ syscall.SYS_RT_SIGPROCMASK: {},
+ syscall.SYS_RT_SIGRETURN: {},
+ syscall.SYS_SCHED_YIELD: {},
+ syscall.SYS_SENDMSG: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
+ },
+ },
+ syscall.SYS_SETITIMER: {},
+ syscall.SYS_SHUTDOWN: []seccomp.Rule{
+ {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
+ },
+ syscall.SYS_SIGALTSTACK: {},
+ syscall.SYS_SYNC_FILE_RANGE: {},
+ // The first argument must equal this process's PID. os.Getpid() is
+ // evaluated once, when this package-level variable is initialized.
+ syscall.SYS_TGKILL: []seccomp.Rule{
+ {
+ seccomp.AllowValue(uint64(os.Getpid())),
+ },
+ },
+ syscall.SYS_WRITE: {},
+ // The only user in rawfile.NonBlockingWrite3 always passes iovcnt with
+ // values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR
+ // option is enabled for a packet socket.
+ syscall.SYS_WRITEV: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(2),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(3),
+ },
+ },
+}
+
+// hostInetFilters contains syscalls that are needed by sentry/socket/hostinet.
+//
+// Install merges the returned rules into the base set when
+// Options.HostNetwork is true. Several keys (SYS_GETSOCKOPT, SYS_IOCTL,
+// SYS_RECVMSG, SYS_SENDMSG, SYS_SHUTDOWN, SYS_WRITEV) also appear in
+// allowedSyscalls; here SYS_RECVMSG, SYS_SENDMSG and SYS_WRITEV carry no
+// argument rules.
+//
+// NOTE(review): how SyscallRules.Merge combines an entry without rules with
+// an entry that has rules is not visible here — confirm against pkg/seccomp.
+func hostInetFilters() seccomp.SyscallRules {
+ return seccomp.SyscallRules{
+ // The fourth argument (flags) must be SOCK_NONBLOCK|SOCK_CLOEXEC.
+ syscall.SYS_ACCEPT4: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ },
+ },
+ syscall.SYS_BIND: {},
+ syscall.SYS_CONNECT: {},
+ syscall.SYS_GETPEERNAME: {},
+ syscall.SYS_GETSOCKNAME: {},
+ // Readable socket options, as (level, option) pairs in the second
+ // and third arguments.
+ syscall.SYS_GETSOCKOPT: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_IPV6),
+ seccomp.AllowValue(syscall.IPV6_V6ONLY),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_ERROR),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_KEEPALIVE),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_SNDBUF),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_RCVBUF),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_REUSEADDR),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_TYPE),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_LINGER),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_TCP),
+ seccomp.AllowValue(syscall.TCP_NODELAY),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_TCP),
+ seccomp.AllowValue(syscall.TCP_INFO),
+ },
+ },
+ // Only the TIOCOUTQ and TIOCINQ requests are added.
+ syscall.SYS_IOCTL: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.TIOCOUTQ),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.TIOCINQ),
+ },
+ },
+ syscall.SYS_LISTEN: {},
+ syscall.SYS_READV: {},
+ syscall.SYS_RECVFROM: {},
+ syscall.SYS_RECVMSG: {},
+ syscall.SYS_SENDMSG: {},
+ syscall.SYS_SENDTO: {},
+ // Settable socket options. In every rule the fifth argument (the
+ // option length) must be 4.
+ syscall.SYS_SETSOCKOPT: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_IPV6),
+ seccomp.AllowValue(syscall.IPV6_V6ONLY),
+ seccomp.AllowAny{},
+ seccomp.AllowValue(4),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_SNDBUF),
+ seccomp.AllowAny{},
+ seccomp.AllowValue(4),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_RCVBUF),
+ seccomp.AllowAny{},
+ seccomp.AllowValue(4),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_REUSEADDR),
+ seccomp.AllowAny{},
+ seccomp.AllowValue(4),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_TCP),
+ seccomp.AllowValue(syscall.TCP_NODELAY),
+ seccomp.AllowAny{},
+ seccomp.AllowValue(4),
+ },
+ },
+ syscall.SYS_SHUTDOWN: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SHUT_RD),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SHUT_WR),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SHUT_RDWR),
+ },
+ },
+ // Only non-blocking, close-on-exec stream and datagram sockets in
+ // the AF_INET and AF_INET6 families, with protocol 0.
+ syscall.SYS_SOCKET: []seccomp.Rule{
+ {
+ seccomp.AllowValue(syscall.AF_INET),
+ seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.AllowValue(0),
+ },
+ {
+ seccomp.AllowValue(syscall.AF_INET),
+ seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.AllowValue(0),
+ },
+ {
+ seccomp.AllowValue(syscall.AF_INET6),
+ seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.AllowValue(0),
+ },
+ {
+ seccomp.AllowValue(syscall.AF_INET6),
+ seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.AllowValue(0),
+ },
+ },
+ syscall.SYS_WRITEV: {},
+ }
+}
+
+// ptraceFilters lists the syscalls that only the ptrace platform issues. No
+// entry carries argument rules. Install merges the result into the base set
+// when the platform is *ptrace.PTrace.
+func ptraceFilters() seccomp.SyscallRules {
+	rules := seccomp.SyscallRules{
+		syscall.SYS_PTRACE:         {},
+		syscall.SYS_TGKILL:         {},
+		syscall.SYS_WAIT4:          {},
+		unix.SYS_GETCPU:            {},
+		unix.SYS_SCHED_SETAFFINITY: {},
+	}
+	return rules
+}
+
+// kvmFilters lists the syscalls that only the KVM platform issues. No entry
+// carries argument rules. Install merges the result into the base set when
+// the platform is *kvm.KVM.
+func kvmFilters() seccomp.SyscallRules {
+	rules := seccomp.SyscallRules{
+		// KVM uses syscall -1 to transition to host.
+		0xffffffffffffffff: {},
+
+		syscall.SYS_ARCH_PRCTL:      {},
+		syscall.SYS_IOCTL:           {},
+		syscall.SYS_MMAP:            {},
+		syscall.SYS_RT_SIGSUSPEND:   {},
+		syscall.SYS_RT_SIGTIMEDWAIT: {},
+	}
+	return rules
+}
+
+// controlServerFilters returns the rules for the control server socket fd:
+// accept and listen are permitted only on fd itself (listen additionally only
+// with a backlog of 16), and getsockopt is permitted for
+// SOL_SOCKET/SO_PEERCRED on any socket.
+func controlServerFilters(fd int) seccomp.SyscallRules {
+	onFD := seccomp.AllowValue(fd)
+
+	rules := seccomp.SyscallRules{
+		syscall.SYS_ACCEPT: []seccomp.Rule{
+			{onFD},
+		},
+		syscall.SYS_LISTEN: []seccomp.Rule{
+			{onFD, seccomp.AllowValue(16 /* unet.backlog */)},
+		},
+		syscall.SYS_GETSOCKOPT: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_PEERCRED),
+			},
+		},
+	}
+	return rules
+}
+
+// profileFilters returns extra syscalls made by runtime/pprof package.
+func profileFilters() seccomp.SyscallRules {
+ return seccomp.SyscallRules{
+ syscall.SYS_OPENAT: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
+ },
+ },
+ }
+}
diff --git a/runsc/boot/filter/extra_filters.go b/runsc/boot/filter/extra_filters.go
new file mode 100644
index 000000000..5c5ec4e06
--- /dev/null
+++ b/runsc/boot/filter/extra_filters.go
@@ -0,0 +1,28 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !msan,!race
+
+package filter
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+)
+
+// instrumentationFilters returns additional filters for syscalls used by
+// Go instrumentation tools, e.g. -race, -msan.
+//
+// This variant is compiled when neither the msan nor the race build tag is
+// set; it returns nil, i.e. no additional filters.
+func instrumentationFilters() seccomp.SyscallRules {
+ return nil
+}
diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go
new file mode 100644
index 000000000..ac5a0f1aa
--- /dev/null
+++ b/runsc/boot/filter/extra_filters_msan.go
@@ -0,0 +1,32 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build msan
+
+package filter
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+)
+
+// instrumentationFilters is the variant built with the msan tag. It logs a
+// seccomp warning through Report and returns the extra syscalls used by MSAN,
+// none of which carries argument rules.
+func instrumentationFilters() seccomp.SyscallRules {
+	Report("MSAN is enabled: syscall filters less restrictive!")
+
+	extra := seccomp.SyscallRules{
+		syscall.SYS_SET_ROBUST_LIST:   {},
+		syscall.SYS_SCHED_GETAFFINITY: {},
+	}
+	return extra
+}
diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go
new file mode 100644
index 000000000..ba3c1ce87
--- /dev/null
+++ b/runsc/boot/filter/extra_filters_race.go
@@ -0,0 +1,40 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build race
+
+package filter
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+)
+
+// instrumentationFilters is the variant built with the race tag. It logs a
+// seccomp warning through Report and returns the extra syscalls used by TSAN,
+// none of which carries argument rules.
+func instrumentationFilters() seccomp.SyscallRules {
+	Report("TSAN is enabled: syscall filters less restrictive!")
+
+	extra := seccomp.SyscallRules{
+		// Used within glibc's malloc.
+		syscall.SYS_TIME: {},
+
+		syscall.SYS_BRK:             {},
+		syscall.SYS_CLONE:           {},
+		syscall.SYS_FUTEX:           {},
+		syscall.SYS_MMAP:            {},
+		syscall.SYS_MUNLOCK:         {},
+		syscall.SYS_NANOSLEEP:       {},
+		syscall.SYS_OPEN:            {},
+		syscall.SYS_SET_ROBUST_LIST: {},
+	}
+	return extra
+}
diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go
new file mode 100644
index 000000000..17479e0dd
--- /dev/null
+++ b/runsc/boot/filter/filter.go
@@ -0,0 +1,71 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package filter defines all syscalls the sandbox is allowed to make
+// to the host, and installs seccomp filters to prevent prohibited
+// syscalls in case it's compromised.
+package filter
+
+import (
+ "fmt"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace"
+)
+
+// Options are seccomp filter related options.
+type Options struct {
+ // Platform selects the platform-specific filters: Install accepts
+ // *ptrace.PTrace and *kvm.KVM and fails for any other type.
+ Platform platform.Platform
+ // HostNetwork, when true, makes Install add hostInetFilters.
+ HostNetwork bool
+ // ProfileEnable, when true, makes Install add profileFilters.
+ ProfileEnable bool
+ // ControllerFD is the FD handed to controlServerFilters.
+ ControllerFD int
+}
+
+// Install installs the seccomp filters selected by opt.
+//
+// The rule set starts from allowedSyscalls and is extended, in this order,
+// with the control server filters for opt.ControllerFD, the instrumentation
+// filters, the host networking filters (if opt.HostNetwork), the profiling
+// filters (if opt.ProfileEnable) and finally the filters of the platform in
+// opt.Platform. A warning is logged for each optional set that loosens the
+// filters. An error is returned for an unrecognized platform type; otherwise
+// the result of seccomp.Install is returned.
+func Install(opt Options) error {
+	// rules refers to the same map as the package-level allowedSyscalls, so
+	// the Merge calls below operate on that map.
+	rules := allowedSyscalls
+	rules.Merge(controlServerFilters(opt.ControllerFD))
+
+	// Extra filters needed by -race and -msan builds; empty otherwise.
+	rules.Merge(instrumentationFilters())
+
+	if opt.HostNetwork {
+		Report("host networking enabled: syscall filters less restrictive!")
+		rules.Merge(hostInetFilters())
+	}
+	if opt.ProfileEnable {
+		Report("profile enabled: syscall filters less restrictive!")
+		rules.Merge(profileFilters())
+	}
+
+	var platformRules seccomp.SyscallRules
+	switch opt.Platform.(type) {
+	case *ptrace.PTrace:
+		platformRules = ptraceFilters()
+	case *kvm.KVM:
+		platformRules = kvmFilters()
+	default:
+		return fmt.Errorf("unknown platform type %T", opt.Platform)
+	}
+	rules.Merge(platformRules)
+
+	return seccomp.Install(rules)
+}
+
+// Report writes a warning message to the log, prefixed with
+// "*** SECCOMP WARNING: ". Within this package it is called whenever a filter
+// set that makes the sandbox less restrictive is enabled.
+func Report(msg string) {
+ log.Warningf("*** SECCOMP WARNING: %s", msg)
+}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
new file mode 100644
index 000000000..4b1557b9a
--- /dev/null
+++ b/runsc/boot/fs.go
@@ -0,0 +1,774 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+ "path"
+ "path/filepath"
+ "strconv"
+ "strings"
+ "syscall"
+
+ // Include filesystem types that OCI spec might mount.
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/dev"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+const (
+ // Filesystem name for 9p gofer mounts.
+ rootFsName = "9p"
+
+ // Device name for root mount. createRootMount passes it as the device
+ // when mounting the root filesystem.
+ rootDevice = "9pfs-/"
+
+ // ChildContainersDir is the directory where child container root
+ // filesystems are mounted. setupRootContainerFS mounts a tmpfs at this
+ // path.
+ ChildContainersDir = "/__runsc_containers__"
+
+ // Filesystems that runsc supports. These are the mount type strings
+ // that getMountNameAndOptions matches against a mount's Type; mounts
+ // of type nonefs are given the sysfs filesystem name there.
+ bind = "bind"
+ devpts = "devpts"
+ devtmpfs = "devtmpfs"
+ proc = "proc"
+ sysfs = "sysfs"
+ tmpfs = "tmpfs"
+ nonefs = "none"
+)
+
+// fdDispenser hands out file descriptors from a fixed list, one at a time and
+// in order.
+type fdDispenser struct {
+	// fds holds the descriptors that have not been handed out yet.
+	fds []int
+}
+
+// remove returns the next descriptor and drops it from the dispenser. It
+// panics when no descriptors are left.
+func (f *fdDispenser) remove() int {
+	if len(f.fds) == 0 {
+		panic("fdDispenser out of fds")
+	}
+	next, rest := f.fds[0], f.fds[1:]
+	f.fds = rest
+	return next
+}
+
+// empty reports whether every descriptor has been handed out.
+func (f *fdDispenser) empty() bool {
+	remaining := len(f.fds)
+	return remaining == 0
+}
+
+// adjustDirentCache caps the gofer dirent cache at half of the host's soft
+// RLIMIT_NOFILE. Nothing changes when the limit is infinite or when half the
+// limit is not below the current gofer.DefaultDirentCacheSize. Otherwise the
+// new size is logged, stored in the package-level
+// gofer.DefaultDirentCacheSize, and used for a fresh k.DirentCacheLimiter.
+//
+// An error is returned only when the rlimit cannot be read.
+func adjustDirentCache(k *kernel.Kernel) error {
+	var hl syscall.Rlimit
+	if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil {
+		return fmt.Errorf("getting RLIMIT_NOFILE: %v", err)
+	}
+	if int64(hl.Cur) == syscall.RLIM_INFINITY {
+		return nil
+	}
+
+	newSize := hl.Cur / 2
+	if newSize >= gofer.DefaultDirentCacheSize {
+		return nil
+	}
+
+	log.Infof("Setting gofer dirent cache size to %d", newSize)
+	gofer.DefaultDirentCacheSize = newSize
+	k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize)
+	return nil
+}
+
+// setupRootContainerFS builds the mount namespace of the root container: the
+// root filesystem plus every submount. 'rootCtx' is the context used to
+// create the root mount and to walk directories when locating mount points;
+// 'userCtx' is the context the mount namespace is created with. 'setMountNS'
+// is invoked once the namespace exists and must install it on 'rootCtx'.
+// 'goferFDs' are consumed in order, first by the root mount and then by the
+// submounts.
+func setupRootContainerFS(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, goferFDs []int, setMountNS func(*fs.MountNamespace)) error {
+	// On top of the compiled mounts, add a tmpfs under which the root
+	// filesystem of each child container is created and mounted.
+	childContainers := specs.Mount{
+		Type:        tmpfs,
+		Destination: ChildContainersDir,
+	}
+	mounts := append(compileMounts(spec), childContainers)
+
+	dispenser := &fdDispenser{fds: goferFDs}
+	rootInode, err := createRootMount(rootCtx, spec, conf, dispenser, mounts)
+	if err != nil {
+		return fmt.Errorf("creating root mount: %v", err)
+	}
+
+	mns, err := fs.NewMountNamespace(userCtx, rootInode)
+	if err != nil {
+		return fmt.Errorf("creating root mount namespace: %v", err)
+	}
+	setMountNS(mns)
+
+	// The reference on the root is held while the submounts are mounted.
+	root := mns.Root()
+	defer root.DecRef()
+	return mountSubmounts(rootCtx, conf, mns, root, mounts, dispenser)
+}
+
+// compileMounts assembles the list of mounts for a container from its spec.
+//
+// The result contains, in order: "/proc" and "/sys" when the spec does not
+// mount them itself (the OCI specification says they SHOULD be mounted),
+// then "/dev" and "/dev/pts", which are always added, and finally the spec's
+// own mounts in their original order. Spec mounts rejected by
+// specutils.IsSupportedDevMount are dropped with a warning.
+func compileMounts(spec *specs.Spec) []specs.Mount {
+	// /dev and /dev/pts are always mounted.
+	mounts := []specs.Mount{
+		{Type: devtmpfs, Destination: "/dev"},
+		{Type: devpts, Destination: "/dev/pts"},
+	}
+
+	// Append the spec's mounts, noting whether it covers proc and sys.
+	procMounted, sysMounted := false, false
+	for _, m := range spec.Mounts {
+		if !specutils.IsSupportedDevMount(m) {
+			log.Warningf("ignoring dev mount at %q", m.Destination)
+			continue
+		}
+		mounts = append(mounts, m)
+
+		dest := filepath.Clean(m.Destination)
+		if dest == "/proc" {
+			procMounted = true
+		} else if dest == "/sys" {
+			sysMounted = true
+		}
+	}
+
+	// Mandatory mounts go first, so that submounts of them that already
+	// appear in the spec are mounted afterwards.
+	var mandatory []specs.Mount
+	if !procMounted {
+		mandatory = append(mandatory, specs.Mount{Type: proc, Destination: "/proc"})
+	}
+	if !sysMounted {
+		mandatory = append(mandatory, specs.Mount{Type: sysfs, Destination: "/sys"})
+	}
+	return append(mandatory, mounts...)
+}
+
+// createRootMount creates the root filesystem.
+func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser, mounts []specs.Mount) (*fs.Inode, error) {
+ // First construct the filesystem from the spec.Root.
+ mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly || conf.Overlay}
+
+ var (
+ rootInode *fs.Inode
+ err error
+ )
+
+ fd := fds.remove()
+ log.Infof("Mounting root over 9P, ioFD: %d", fd)
+ p9FS := mustFindFilesystem("9p")
+ opts := p9MountOptions(fd, conf.FileAccess)
+ rootInode, err = p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
+ if err != nil {
+ return nil, fmt.Errorf("creating root mount point: %v", err)
+ }
+
+ // We need to overlay the root on top of a ramfs with stub directories
+ // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always
+ // mounted even if they are not in the spec.
+ submounts := append(subtargets("/", mounts), "/dev", "/sys", "/proc", "/tmp")
+ rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
+ if err != nil {
+ return nil, fmt.Errorf("adding submount overlay: %v", err)
+ }
+
+ if conf.Overlay && !spec.Root.Readonly {
+ log.Debugf("Adding overlay on top of root mount")
+ // Overlay a tmpfs filesystem on top of the root.
+ rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ log.Infof("Mounted %q to %q type root", spec.Root.Path, "/")
+ return rootInode, nil
+}
+
+// addOverlay stacks a writable tmpfs layer on top of lower and returns the
+// resulting overlay inode. When lower is a directory a tmpfs named
+// name+"-upper" is mounted as the upper layer; otherwise (e.g. a file mount
+// such as /etc/hostname) a file overlay backed by a caching tmpfs mount
+// source is created. The overlay uses lowerFlags with ReadOnly cleared.
+func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
+	// Same flags as the lower layer, except that the upper is read-write.
+	upperFlags := lowerFlags
+	upperFlags.ReadOnly = false
+
+	tmpFS := mustFindFilesystem("tmpfs")
+	if fs.IsDir(lower.StableAttr) {
+		// Directory mount: overlay a freshly mounted tmpfs.
+		upper, err := tmpFS.Mount(ctx, name+"-upper", upperFlags, "", nil)
+		if err != nil {
+			return nil, fmt.Errorf("creating tmpfs overlay: %v", err)
+		}
+		return fs.NewOverlayRoot(ctx, upper, lower, upperFlags)
+	}
+
+	// File mount: overlay the single file.
+	msrc := fs.NewCachingMountSource(tmpFS, upperFlags)
+	return fs.NewOverlayRootFile(ctx, msrc, lower, upperFlags)
+}
+
+// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
+// used for mounts.
+func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (string, []string, bool, error) {
+ var (
+ fsName string
+ opts []string
+ useOverlay bool
+ err error
+ )
+
+ switch m.Type {
+ case devpts, devtmpfs, proc, sysfs:
+ fsName = m.Type
+ case nonefs:
+ fsName = sysfs
+ case tmpfs:
+ fsName = m.Type
+
+ // tmpfs has some extra supported options that we must pass through.
+ opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid")
+
+ case bind:
+ fd := fds.remove()
+ fsName = "9p"
+ // Non-root bind mounts are always shared.
+ opts = p9MountOptions(fd, FileAccessShared)
+ // If configured, add overlay to all writable mounts.
+ useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
+
+ default:
+ // TODO(nlacasse): Support all the mount types and make this a
+ // fatal error. Most applications will "just work" without
+ // them, so this is a warning for now.
+ // we do not support.
+ log.Warningf("ignoring unknown filesystem type %q", m.Type)
+ }
+ return fsName, opts, useOverlay, err
+}
+
+// mountSubmounts mounts every entry of mounts under root, then the internal
+// '/tmp' tmpfs (see mountTmp), and finally verifies that all gofer FDs in fds
+// were consumed.
+func mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, mounts []specs.Mount, fds *fdDispenser) error {
+	for i := range mounts {
+		mnt := mounts[i]
+		err := mountSubmount(ctx, conf, mns, root, fds, mnt, mounts)
+		if err != nil {
+			return fmt.Errorf("mount submount %q: %v", mnt.Destination, err)
+		}
+	}
+
+	err := mountTmp(ctx, conf, mns, root, mounts)
+	if err != nil {
+		return fmt.Errorf("mount submount %q: %v", "tmp", err)
+	}
+
+	// Every donated FD should have been claimed by some mount.
+	if fds.empty() {
+		return nil
+	}
+	return fmt.Errorf("not all mount points were consumed, remaining: %v", fds)
+}
+
+// mountSubmount mounts volumes inside the container's root. Because mounts may
+// be readonly, a lower ramfs overlay is added to create the mount point dir.
+// Another overlay is added with tmpfs on top if Config.Overlay is true.
+// 'm.Destination' must be an absolute path with '..' and symlinks resolved. 'mounts' is the complete mount list and is only used to find mounts nested under 'm'. Mounts with an unrecognized type are skipped and nil is returned.
+func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, fds *fdDispenser, m specs.Mount, mounts []specs.Mount) error {
+ // Map mount type to filesystem name, and parse out the options that we are
+ // capable of dealing with.
+ fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds)
+
+ // An error here comes from option parsing in getMountNameAndOptions.
+ if err != nil {
+ return err
+ }
+ if fsName == "" { // Unknown mount type: already logged, nothing to mount.
+ return nil
+ }
+
+ // All filesystem names should have been mapped to something we know.
+ filesystem := mustFindFilesystem(fsName) // Panics if the name is not registered.
+
+ mf := mountFlags(m.Options)
+ if useOverlay {
+ // All writes go to upper, be paranoid and make lower readonly.
+ mf.ReadOnly = true
+ }
+
+ inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
+ if err != nil {
+ return fmt.Errorf("creating mount with source %q: %v", m.Source, err)
+ }
+
+ // If there are submounts, we need to overlay the mount on top of a
+ // ramfs with stub directories for submount paths.
+ submounts := subtargets(m.Destination, mounts)
+ if len(submounts) > 0 {
+ log.Infof("Adding submount overlay over %q", m.Destination)
+ inode, err = addSubmountOverlay(ctx, inode, submounts)
+ if err != nil {
+ return fmt.Errorf("adding submount overlay: %v", err)
+ }
+ }
+
+ if useOverlay {
+ log.Debugf("Adding overlay on top of mount %q", m.Destination)
+ inode, err = addOverlay(ctx, conf, inode, m.Type, mf) // addOverlay clears ReadOnly on its own copy of mf for the upper layer.
+ if err != nil {
+ return err
+ }
+ }
+
+ maxTraversals := uint(0) // NOTE(review): presumably the symlink traversal budget for FindInode; zero matches the "symlinks resolved" precondition — confirm.
+ dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals)
+ if err != nil {
+ return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
+ }
+ defer dirent.DecRef() // Release the reference obtained from FindInode once the mount is done.
+ if err := mns.Mount(ctx, dirent, inode); err != nil {
+ return fmt.Errorf("mount %q error: %v", m.Destination, err)
+ }
+
+ log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type)
+ return nil
+}
+
+// p9MountOptions returns the mount data options for a 9P mount that talks to
+// the gofer over fd. The same FD is used for both reading and writing. When
+// fa is FileAccessShared, remote-revalidating caching is requested as well.
+func p9MountOptions(fd int, fa FileAccessType) []string {
+	fdStr := strconv.Itoa(fd)
+	opts := []string{
+		"trans=fd",
+		"rfdno=" + fdStr,
+		"wfdno=" + fdStr,
+		"privateunixsocket=true",
+	}
+	if fa != FileAccessShared {
+		return opts
+	}
+	return append(opts, "cache=remote_revalidating")
+}
+
+// parseAndFilterOptions parses a MountOptions slice and filters by the allowed
+// keys.
+//
+// Each option is either a bare "key" or a "key=value" pair. Options whose key
+// is listed in allowedKeys are returned unchanged and in their original
+// order; all other options are dropped with a warning. An option containing
+// more than one '=' is rejected with an error.
+func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) {
+	var out []string
+	for _, o := range opts {
+		kv := strings.Split(o, "=")
+		if len(kv) > 2 {
+			return nil, fmt.Errorf("invalid option %q", o)
+		}
+
+		// strings.Split always yields at least one element, so kv[0] is
+		// the key for both the "key" and the "key=value" forms.
+		key := kv[0]
+		if !specutils.ContainsStr(allowedKeys, key) {
+			// Log the key itself rather than the split slice, so bare
+			// keys and key=value pairs are reported the same way.
+			log.Warningf("ignoring unsupported key %q", key)
+			continue
+		}
+		out = append(out, o)
+	}
+	return out, nil
+}
+
+// mountDevice returns the device string to use for the mount, derived from
+// its filesystem type and destination.
+func mountDevice(m specs.Mount) string {
+	if m.Type != bind {
+		// Everything except bind mounts uses device "none".
+		return "none"
+	}
+	// Embed the target in the device string: it is consistent across S/R
+	// and uniquely identifies the connection.
+	return "9pfs-" + m.Destination
+}
+
+// addRestoreMount records the mount arguments for m in renv.MountSources,
+// keyed by filesystem name, so that the mount can be re-established when a
+// checkpointed container is restored. Mounts with an unrecognized type are
+// ignored.
+func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, fds *fdDispenser) error {
+	fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds)
+	switch {
+	case err != nil:
+		// Option parsing failed.
+		return err
+	case fsName == "":
+		// TODO(nlacasse): Fix this when we support all the mount types and
+		// make this a fatal error.
+		return nil
+	}
+
+	flags := mountFlags(m.Options)
+	if useOverlay {
+		// With an overlay on top, the lower layer is read-only.
+		flags.ReadOnly = true
+	}
+	restoreArgs := fs.MountArgs{
+		Dev:        mountDevice(m),
+		Flags:      flags,
+		DataString: strings.Join(opts, ","),
+	}
+	renv.MountSources[fsName] = append(renv.MountSources[fsName], restoreArgs)
+	log.Infof("Added mount at %q: %+v", fsName, restoreArgs)
+	return nil
+}
+
+// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding the mounts
+// to the environment.
+func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.RestoreEnvironment, error) {
+ renv := &fs.RestoreEnvironment{
+ MountSources: make(map[string][]fs.MountArgs),
+ }
+
+ // Add root mount.
+ fd := fds.remove()
+ opts := p9MountOptions(fd, conf.FileAccess)
+
+ mf := fs.MountSourceFlags{}
+ if spec.Root.Readonly || conf.Overlay {
+ mf.ReadOnly = true
+ }
+
+ rootMount := fs.MountArgs{
+ Dev: rootDevice,
+ Flags: mf,
+ DataString: strings.Join(opts, ","),
+ }
+ renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount)
+
+ // Add submounts.
+ var tmpMounted bool
+ for _, m := range compileMounts(spec) {
+ if err := addRestoreMount(conf, renv, m, fds); err != nil {
+ return nil, err
+ }
+ if filepath.Clean(m.Destination) == "/tmp" {
+ tmpMounted = true
+ }
+ }
+
+ // TODO(b/67958150): handle '/tmp' properly (see mountTmp()).
+ if !tmpMounted {
+ tmpMount := specs.Mount{
+ Type: tmpfs,
+ Destination: "/tmp",
+ }
+ if err := addRestoreMount(conf, renv, tmpMount, fds); err != nil {
+ return nil, err
+ }
+ }
+
+ return renv, nil
+}
+
+func mountFlags(opts []string) fs.MountSourceFlags {
+ mf := fs.MountSourceFlags{}
+ for _, o := range opts {
+ switch o {
+ case "rw":
+ mf.ReadOnly = false
+ case "ro":
+ mf.ReadOnly = true
+ case "noatime":
+ mf.NoAtime = true
+ case "noexec":
+ mf.NoExec = true
+ default:
+ log.Warningf("ignoring unknown mount option %q", o)
+ }
+ }
+ return mf
+}
+
+// mustFindFilesystem returns the registered filesystem called name and panics
+// if no such filesystem is registered.
+func mustFindFilesystem(name string) fs.Filesystem {
+	if filesystem, found := fs.FindFilesystem(name); found {
+		return filesystem
+	}
+	panic(fmt.Sprintf("could not find filesystem %q", name))
+}
+
+// addSubmountOverlay returns an overlay whose upper layer is inode and whose
+// lower layer is a ramfs directory tree containing the given submount paths.
+func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
+	mountTree, err := ramfs.MakeDirectoryTree(ctx, fs.NewPseudoMountSource(), submounts)
+	if err != nil {
+		return nil, fmt.Errorf("creating mount tree: %v", err)
+	}
+
+	overlay, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{})
+	if err != nil {
+		return nil, fmt.Errorf("adding mount overlay: %v", err)
+	}
+	return overlay, nil
+}
+
+// subtargets filters mnts down to the destinations that fs.IsSubpath reports
+// as lying under root, and returns them as paths relative to root.
+func subtargets(root string, mnts []specs.Mount) []string {
+	var targets []string
+	for i := range mnts {
+		relPath, isSubpath := fs.IsSubpath(mnts[i].Destination, root)
+		if !isSubpath {
+			continue
+		}
+		targets = append(targets, relPath)
+	}
+	return targets
+}
+
+// setupContainerFS is used to set up the file system and amend the procArgs accordingly.
+// procArgs are passed by reference and the FDMap field is modified. It dups stdioFDs.
+//
+// For the first (root) container, i.e. when the kernel has no root mount
+// namespace yet, the work is delegated to setupRootContainerFS. For child
+// containers a directory named cid is created under ChildContainersDir in the
+// root mount namespace, the container's root filesystem is mounted on it,
+// procArgs.Root is pointed at it and all submounts from the spec are mounted
+// below it. goferFDs are consumed in mount order.
+func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error {
+	ctx := procArgs.NewContext(k)
+
+	// Create the FD map, which will set stdin, stdout, and stderr. If console
+	// is true, then ioctl calls will be passed through to the host fd.
+	fdm, err := createFDMap(ctx, k, ls, console, stdioFDs)
+	if err != nil {
+		return fmt.Errorf("importing fds: %v", err)
+	}
+
+	// CreateProcess takes a reference on FDMap if successful. We
+	// won't need ours either way.
+	procArgs.FDMap = fdm
+
+	// Use root user to configure mounts. The current user might not have
+	// permission to do so.
+	rootProcArgs := kernel.CreateProcessArgs{
+		WorkingDirectory:     "/",
+		Credentials:          auth.NewRootCredentials(creds.UserNamespace),
+		Umask:                0022,
+		MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
+	}
+	rootCtx := rootProcArgs.NewContext(k)
+
+	// If this is the root container, we also need to setup the root mount
+	// namespace.
+	mns := k.RootMountNamespace()
+	if mns == nil {
+		// Setup the root container.
+		return setupRootContainerFS(ctx, rootCtx, spec, conf, goferFDs, func(mns *fs.MountNamespace) {
+			k.SetRootMountNamespace(mns)
+		})
+	}
+
+	// Setup a child container.
+	log.Infof("Creating new process in child container.")
+	globalRoot := mns.Root()
+	defer globalRoot.DecRef()
+
+	// Create mount point for the container's rootfs.
+	maxTraversals := uint(0)
+	contDir, err := mns.FindInode(ctx, globalRoot, nil, ChildContainersDir, &maxTraversals)
+	if err != nil {
+		return fmt.Errorf("couldn't find child container dir %q: %v", ChildContainersDir, err)
+	}
+	// Release the reference returned by FindInode, as every other FindInode
+	// caller in this file does; otherwise it is leaked on every child
+	// container creation.
+	defer contDir.DecRef()
+	if err := contDir.CreateDirectory(ctx, globalRoot, cid, fs.FilePermsFromMode(0755)); err != nil {
+		return fmt.Errorf("create directory %q: %v", cid, err)
+	}
+	containerRoot, err := contDir.Walk(ctx, globalRoot, cid)
+	if err != nil {
+		return fmt.Errorf("walk to %q failed: %v", cid, err)
+	}
+	// The receiver is evaluated now, so this releases the pre-mount dirent
+	// even though containerRoot is reassigned below.
+	defer containerRoot.DecRef()
+
+	// Create the container's root filesystem mount.
+	fds := &fdDispenser{fds: goferFDs}
+	rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil)
+	if err != nil {
+		return fmt.Errorf("creating filesystem for container: %v", err)
+	}
+
+	// Mount the container's root filesystem to the newly created mount point.
+	if err := mns.Mount(ctx, containerRoot, rootInode); err != nil {
+		return fmt.Errorf("mount container root: %v", err)
+	}
+
+	// We have to re-walk to the dirent to find the mounted
+	// directory. The old dirent is invalid at this point.
+	containerRoot, err = contDir.Walk(ctx, globalRoot, cid)
+	if err != nil {
+		return fmt.Errorf("find container mount point %q: %v", cid, err)
+	}
+	// Drop the new reference on failure only; on success it is handed over
+	// to procArgs.Root and the cleanup is released below.
+	cu := specutils.MakeCleanup(func() { containerRoot.DecRef() })
+	defer cu.Clean()
+
+	log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, cid))
+
+	// Set process root here, so 'rootCtx.Value(CtxRoot)' will return it.
+	procArgs.Root = containerRoot
+
+	// Mount all submounts.
+	mounts := compileMounts(spec)
+	if err := mountSubmounts(rootCtx, conf, mns, containerRoot, mounts, fds); err != nil {
+		return err
+	}
+	cu.Release()
+	return nil
+}
+
+// setExecutablePath resolves procArgs.Argv[0] against the PATH found in
+// procArgs.Envv (relative to procArgs.WorkingDirectory) and stores the
+// result in procArgs.Filename.
+func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error {
+	searchPaths := fs.GetPath(procArgs.Envv)
+	binary := procArgs.Argv[0]
+	cwd := procArgs.WorkingDirectory
+
+	resolved, err := mns.ResolveExecutablePath(ctx, cwd, binary, searchPaths)
+	if err != nil {
+		return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", binary, cwd, strings.Join(searchPaths, ":"), err)
+	}
+	procArgs.Filename = resolved
+	return nil
+}
+
+// destroyContainerFS cleans up the filesystem by unmounting all mounts for the
+// given container and deleting the container root directory. It returns nil if the container root directory no longer exists, and always waits for pending async filesystem operations before returning.
+func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error {
+ defer func() {
+ // Flushing dirent references triggers many async close
+ // operations. We must wait for those to complete before
+ // returning, otherwise the caller may kill the gofer before
+ // they complete, causing a cascade of failing RPCs.
+ //
+ // This must take place in the first deferred function, so that
+ // it runs after all the other deferred DecRef() calls in this
+ // function.
+ log.Infof("Waiting for async filesystem operations to complete")
+ fs.AsyncBarrier()
+ }()
+
+ // First get a reference to the container root directory.
+ mns := k.RootMountNamespace()
+ mnsRoot := mns.Root()
+ defer mnsRoot.DecRef()
+ containerRoot := path.Join(ChildContainersDir, cid) // Path of the container root inside the root mount namespace.
+ maxTraversals := uint(0)
+ containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, &maxTraversals)
+ if err == syserror.ENOENT {
+ // Container must have been destroyed already. That's fine.
+ return nil
+ }
+ if err != nil {
+ return fmt.Errorf("finding container root directory %q: %v", containerRoot, err)
+ }
+ defer containerRootDirent.DecRef()
+
+ // Iterate through all submounts and unmount them. We unmount lazily by
+ // setting detach=true, so we can unmount in any order.
+ mnt := mns.FindMount(containerRootDirent)
+ for _, m := range mns.AllMountsUnder(mnt) {
+ root := m.Root()
+ defer root.DecRef() // Deferred inside the loop: every mount root is released when the function returns, before the AsyncBarrier above runs.
+
+ // Do a best-effort unmount by flushing the refs and unmount
+ // with "detach only = true". Unmount returns EINVAL when the mount point
+ // doesn't exist, i.e. it has already been unmounted.
+ log.Debugf("Unmounting container mount %q", root.BaseName())
+ root.Inode.MountSource.FlushDirentRefs()
+ if err := mns.Unmount(ctx, root, true /* detach only */); err != nil && err != syserror.EINVAL {
+ return fmt.Errorf("unmounting container mount %q: %v", root.BaseName(), err)
+ }
+ }
+
+ // Get a reference to the parent directory and remove the root
+ // container directory.
+ maxTraversals = 0 // Reset before reusing the variable for the second lookup.
+ containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, &maxTraversals)
+ if err != nil {
+ return fmt.Errorf("finding containers directory %q: %v", ChildContainersDir, err)
+ }
+ defer containersDirDirent.DecRef()
+ log.Debugf("Deleting container root %q", containerRoot)
+ if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, cid); err != nil {
+ return fmt.Errorf("removing directory %q: %v", containerRoot, err)
+ }
+
+ return nil
+}
+
+// mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so.
+// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
+// the host /tmp, but this is a nice optimization, and fixes some apps that call
+// mknod in /tmp. It's unsafe to mount tmpfs if:
+// 1. /tmp is mounted explicitly: we should not override user's wish
+// 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
+//
+// Note that when there are submounts inside of '/tmp', directories for the
+// mount points must be present, making '/tmp' not empty anymore.
+func mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, mounts []specs.Mount) error {
+ for _, m := range mounts {
+ if filepath.Clean(m.Destination) == "/tmp" { // Case 1: the spec mounts /tmp itself.
+ log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m)
+ return nil
+ }
+ }
+
+ maxTraversals := uint(0)
+ tmp, err := mns.FindInode(ctx, root, root, "tmp", &maxTraversals)
+ switch err {
+ case nil:
+ // Found '/tmp' in filesystem, check if it's empty.
+ defer tmp.DecRef()
+ f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true})
+ if err != nil {
+ return err
+ }
+ defer f.DecRef()
+ serializer := &fs.CollectEntriesSerializer{} // Collects the directory entries produced by Readdir.
+ if err := f.Readdir(ctx, serializer); err != nil {
+ return err
+ }
+ // If more than "." and ".." is found, skip internal tmpfs to prevent hiding
+ // existing files.
+ if len(serializer.Order) > 2 {
+ log.Infof("Skipping internal tmpfs on top %q, because it's not empty", "/tmp")
+ return nil
+ }
+ log.Infof("Mounting internal tmpfs on top of empty %q", "/tmp")
+ fallthrough
+
+ case syserror.ENOENT:
+ // No '/tmp' found (or fallthrough from above). Safe to mount internal
+ // tmpfs.
+ tmpMount := specs.Mount{
+ Type: tmpfs,
+ Destination: "/tmp",
+ // Sticky bit is added to prevent accidental deletion of files from
+ // another user. This is normally done for /tmp.
+ Options: []string{"mode=1777"},
+ }
+ return mountSubmount(ctx, conf, mns, root, nil, tmpMount, mounts) // fds is nil: the tmpfs case of getMountNameAndOptions does not take an FD.
+
+ default:
+ return err // Any other lookup failure is reported to the caller.
+ }
+}
diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go
new file mode 100644
index 000000000..3364aa5e6
--- /dev/null
+++ b/runsc/boot/limits.go
@@ -0,0 +1,154 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+ "sync"
+ "syscall"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+)
+
+// fromLinuxResource maps Linux resource names, as they appear in the OCI spec's rlimit Type field, to limits.LimitType.
+var fromLinuxResource = map[string]limits.LimitType{
+ "RLIMIT_AS": limits.AS,
+ "RLIMIT_CORE": limits.Core,
+ "RLIMIT_CPU": limits.CPU,
+ "RLIMIT_DATA": limits.Data,
+ "RLIMIT_FSIZE": limits.FileSize,
+ "RLIMIT_LOCKS": limits.Locks,
+ "RLIMIT_MEMLOCK": limits.MemoryLocked,
+ "RLIMIT_MSGQUEUE": limits.MessageQueueBytes,
+ "RLIMIT_NICE": limits.Nice,
+ "RLIMIT_NOFILE": limits.NumberOfFiles,
+ "RLIMIT_NPROC": limits.ProcessCount,
+ "RLIMIT_RSS": limits.Rss,
+ "RLIMIT_RTPRIO": limits.RealTimePriority,
+ "RLIMIT_RTTIME": limits.Rttime,
+ "RLIMIT_SIGPENDING": limits.SignalsPending,
+ "RLIMIT_STACK": limits.Stack,
+}
+
+// findName is the reverse lookup of fromLinuxResource: it returns the Linux
+// resource name for lt, or "unknown" when lt has no entry in the table.
+func findName(lt limits.LimitType) string {
+	for name := range fromLinuxResource {
+		if fromLinuxResource[name] == lt {
+			return name
+		}
+	}
+	return "unknown"
+}
+
+var defaults defs // defaults is the lazily initialized default limit set used by createLimitSet.
+
+type defs struct { // defs caches the default limits.LimitSet, or the error from building it.
+ mu sync.Mutex // mu is held by get() for the whole lookup/initialization.
+ set *limits.LimitSet // set is nil until initDefaults succeeds.
+ err error // err is the first initialization failure; once set, get() keeps returning it.
+}
+
+// get returns the cached default limit set, building it on first use. If
+// building fails the error is remembered and returned by every later call.
+func (d *defs) get() (*limits.LimitSet, error) {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	switch {
+	case d.err != nil:
+		// A previous initialization attempt failed.
+		return nil, d.err
+	case d.set != nil:
+		// Already initialized.
+		return d.set, nil
+	}
+
+	if d.err = d.initDefaults(); d.err != nil {
+		return nil, d.err
+	}
+	return d.set, nil
+}
+
+func (d *defs) initDefaults() error { // initDefaults builds the default limit set and stores it in d.set; d.set is left untouched on error.
+ ls, err := limits.NewLinuxLimitSet()
+ if err != nil {
+ return err
+ }
+
+ // Set default limits based on what containers get by default, ex:
+ // $ docker run --rm debian prlimit
+ ls.SetUnchecked(limits.AS, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+ ls.SetUnchecked(limits.Core, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+ ls.SetUnchecked(limits.CPU, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+ ls.SetUnchecked(limits.Data, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+ ls.SetUnchecked(limits.FileSize, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+ ls.SetUnchecked(limits.Locks, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+ ls.SetUnchecked(limits.MemoryLocked, limits.Limit{Cur: 65536, Max: 65536})
+ ls.SetUnchecked(limits.MessageQueueBytes, limits.Limit{Cur: 819200, Max: 819200})
+ ls.SetUnchecked(limits.Nice, limits.Limit{Cur: 0, Max: 0})
+ ls.SetUnchecked(limits.NumberOfFiles, limits.Limit{Cur: 1048576, Max: 1048576})
+ ls.SetUnchecked(limits.ProcessCount, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+ ls.SetUnchecked(limits.Rss, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+ ls.SetUnchecked(limits.RealTimePriority, limits.Limit{Cur: 0, Max: 0})
+ ls.SetUnchecked(limits.Rttime, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+ ls.SetUnchecked(limits.SignalsPending, limits.Limit{Cur: 0, Max: 0})
+ ls.SetUnchecked(limits.Stack, limits.Limit{Cur: 8388608, Max: limits.Infinity})
+
+ // Read host limits that directly affect the sandbox and adjust the defaults
+ // based on them. Only RLIMIT_FSIZE and RLIMIT_NOFILE are consulted.
+ for _, res := range []int{syscall.RLIMIT_FSIZE, syscall.RLIMIT_NOFILE} {
+ var hl syscall.Rlimit
+ if err := syscall.Getrlimit(res, &hl); err != nil {
+ return err
+ }
+
+ lt, ok := limits.FromLinuxResource[res]
+ if !ok {
+ return fmt.Errorf("unknown rlimit type %v", res)
+ }
+ hostLimit := limits.Limit{
+ Cur: limits.FromLinux(hl.Cur),
+ Max: limits.FromLinux(hl.Max),
+ }
+
+ defaultLimit := ls.Get(lt)
+ if hostLimit.Cur != limits.Infinity && hostLimit.Cur < defaultLimit.Cur { // Warn only; the host value is still applied below.
+ log.Warningf("Host limit is lower than recommended, resource: %q, host: %d, recommended: %d", findName(lt), hostLimit.Cur, defaultLimit.Cur)
+ }
+ if hostLimit.Cur != defaultLimit.Cur || hostLimit.Max != defaultLimit.Max { // The host limit replaces the default whenever the two differ.
+ log.Infof("Setting limit from host, resource: %q {soft: %d, hard: %d}", findName(lt), hostLimit.Cur, hostLimit.Max)
+ ls.SetUnchecked(lt, hostLimit)
+ }
+ }
+
+ d.set = ls
+ return nil
+}
+
+// createLimitSet returns the limit set for spec: the sandbox defaults with
+// every entry of spec.Process.Rlimits applied on top. An rlimit type that is
+// not in fromLinuxResource is an error.
+//
+// NOTE(review): the set returned by defaults.get() is modified in place, so
+// the overrides appear to persist in the cached defaults across calls —
+// confirm that this sharing is intended.
+func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) {
+	set, err := defaults.get()
+	if err != nil {
+		return nil, err
+	}
+
+	// Spec-provided rlimits override the defaults.
+	for _, rlimit := range spec.Process.Rlimits {
+		kind, known := fromLinuxResource[rlimit.Type]
+		if !known {
+			return nil, fmt.Errorf("unknown resource %q", rlimit.Type)
+		}
+		override := limits.Limit{Cur: rlimit.Soft, Max: rlimit.Hard}
+		set.SetUnchecked(kind, override)
+	}
+	return set, nil
+}
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
new file mode 100644
index 000000000..6ac6b94dd
--- /dev/null
+++ b/runsc/boot/loader.go
@@ -0,0 +1,954 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package boot loads the kernel and runs a container.
+package boot
+
+import (
+ "fmt"
+ mrand "math/rand"
+ "os"
+ "runtime"
+ "sync"
+ "sync/atomic"
+ "syscall"
+ gtime "time"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/rand"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/control"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/inet"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/loader"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memutil"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/sighandling"
+ slinux "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/icmp"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/udp"
+ "gvisor.googlesource.com/gvisor/runsc/boot/filter"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+
+ // Include supported socket providers.
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/hostinet"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/route"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix"
+)
+
+// Loader keeps state needed to start the kernel and run the container.
+type Loader struct {
+ // k is the kernel.
+ k *kernel.Kernel
+
+ // ctrl is the control server.
+ ctrl *controller
+
+ conf *Config // conf is presumably the system configuration passed in Args.Conf — confirm.
+
+ // console is set to true if terminal is enabled.
+ console bool
+
+ watchdog *watchdog.Watchdog // watchdog is presumably the watchdog created in New — confirm.
+
+ // stdioFDs contains stdin, stdout, and stderr.
+ stdioFDs []int
+
+ // goferFDs are the FDs that attach the sandbox to the gofers.
+ goferFDs []int
+
+ // spec is the base configuration for the root container.
+ spec *specs.Spec
+
+ // startSignalForwarding enables forwarding of signals to the sandboxed
+ // container. It should be called after the init process is loaded.
+ startSignalForwarding func() func()
+
+ // stopSignalForwarding disables forwarding of signals to the sandboxed
+ // container. It should be called when a sandbox is destroyed.
+ stopSignalForwarding func()
+
+ // restore is set to true if we are restoring a container.
+ restore bool
+
+ // rootProcArgs refers to the root sandbox init task.
+ rootProcArgs kernel.CreateProcessArgs
+
+ // sandboxID is the ID for the whole sandbox.
+ sandboxID string
+
+ // mu guards processes.
+ mu sync.Mutex
+
+ // processes maps containers init process and invocation of exec. Root
+ // processes are keyed with container ID and pid=0, while exec invocations
+ // have the corresponding pid set.
+ //
+ // processes is guarded by mu.
+ processes map[execID]*execProcess
+}
+
+// execID uniquely identifies a sentry process that is executed in a container. It is the key type of Loader.processes.
+type execID struct {
+ cid string // cid is the container ID.
+ pid kernel.ThreadID // pid is 0 for a container's root process and the process's pid for exec invocations.
+}
+
+// execProcess contains the thread group and host TTY of a sentry process. It is the value type of Loader.processes.
+type execProcess struct {
+ // tg will be nil for containers that haven't started yet.
+ tg *kernel.ThreadGroup
+
+ // tty will be nil if the process is not attached to a terminal.
+ tty *host.TTYFileOperations
+}
+
+func init() { // init performs package-level setup: it seeds math/rand and registers the syscall table.
+ // Seed the math/rand generator with the current wall-clock time.
+ mrand.Seed(gtime.Now().UnixNano())
+
+ // Register the global syscall table (the Linux AMD64 table).
+ kernel.RegisterSyscallTable(slinux.AMD64)
+}
+
+// Args are the arguments for New().
+type Args struct {
+ // ID is the sandbox ID.
+ ID string
+ // Spec is the sandbox specification.
+ Spec *specs.Spec
+ // Conf is the system configuration.
+ Conf *Config
+ // ControllerFD is the FD to the URPC controller.
+ ControllerFD int
+ // Device is an optional argument that is passed to the platform.
+ Device *os.File
+ // GoferFDs is an array of FDs used to connect with the Gofer.
+ GoferFDs []int
+ // StdioFDs is the stdio for the application.
+ StdioFDs []int
+ // Console is set to true if using TTY.
+ Console bool
+ // NumCPU is the number of CPUs to create inside the sandbox. If zero, New uses runtime.NumCPU().
+ NumCPU int
+ // TotalMem is the initial amount of total memory to report back to the
+ // container. It is only applied when greater than zero.
+ TotalMem uint64
+ // UserLogFD is the file descriptor to write user logs to.
+ UserLogFD int
+}
+
+// New initializes a new kernel loader configured by spec.
+// New also handles setting up a kernel for restoring a container.
+func New(args Args) (*Loader, error) {
+ // We initialize the rand package now to make sure /dev/urandom is pre-opened
+ // on kernels that do not support getrandom(2).
+ if err := rand.Init(); err != nil {
+ return nil, fmt.Errorf("setting up rand: %v", err)
+ }
+
+ if err := usage.Init(); err != nil {
+ return nil, fmt.Errorf("setting up memory usage: %v", err)
+ }
+
+ // Create kernel and platform.
+ p, err := createPlatform(args.Conf, args.Device)
+ if err != nil {
+ return nil, fmt.Errorf("creating platform: %v", err)
+ }
+ k := &kernel.Kernel{
+ Platform: p,
+ }
+
+ // Create memory file.
+ mf, err := createMemoryFile()
+ if err != nil {
+ return nil, fmt.Errorf("creating memory file: %v", err)
+ }
+ k.SetMemoryFile(mf)
+
+ // Create VDSO.
+ //
+ // Pass k as the platform since it is savable, unlike the actual platform.
+ vdso, err := loader.PrepareVDSO(k)
+ if err != nil {
+ return nil, fmt.Errorf("creating vdso: %v", err)
+ }
+
+ // Create timekeeper.
+ tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
+ if err != nil {
+ return nil, fmt.Errorf("creating timekeeper: %v", err)
+ }
+ tk.SetClocks(time.NewCalibratedClocks())
+
+ if err := enableStrace(args.Conf); err != nil {
+ return nil, fmt.Errorf("enabling strace: %v", err)
+ }
+
+ // Create an empty network stack because the network namespace may be empty at
+ // this point. Netns is configured before Run() is called. Netstack is
+ // configured using a control uRPC message. Host network is configured inside
+ // Run().
+ networkStack, err := newEmptyNetworkStack(args.Conf, k)
+ if err != nil {
+ return nil, fmt.Errorf("creating network: %v", err)
+ }
+
+ // Create capabilities.
+ caps, err := specutils.Capabilities(args.Conf.EnableRaw, args.Spec.Process.Capabilities)
+ if err != nil {
+ return nil, fmt.Errorf("converting capabilities: %v", err)
+ }
+
+ // Convert the spec's additional GIDs to KGIDs.
+ extraKGIDs := make([]auth.KGID, 0, len(args.Spec.Process.User.AdditionalGids))
+ for _, GID := range args.Spec.Process.User.AdditionalGids {
+ extraKGIDs = append(extraKGIDs, auth.KGID(GID))
+ }
+
+ // Create credentials.
+ creds := auth.NewUserCredentials(
+ auth.KUID(args.Spec.Process.User.UID),
+ auth.KGID(args.Spec.Process.User.GID),
+ extraKGIDs,
+ caps,
+ auth.NewRootUserNamespace())
+
+ if args.NumCPU == 0 {
+ args.NumCPU = runtime.NumCPU()
+ }
+ log.Infof("CPUs: %d", args.NumCPU)
+
+ if args.TotalMem > 0 {
+ // Adjust the total memory returned by the Sentry so that applications that
+ // use /proc/meminfo can make allocations based on this limit.
+ usage.MinimumTotalMemoryBytes = args.TotalMem
+ log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(2^30))
+ }
+
+ // Initiate the Kernel object, which is required by the Context passed
+ // to createVFS in order to mount (among other things) procfs.
+ if err = k.Init(kernel.InitKernelArgs{
+ FeatureSet: cpuid.HostFeatureSet(),
+ Timekeeper: tk,
+ RootUserNamespace: creds.UserNamespace,
+ NetworkStack: networkStack,
+ ApplicationCores: uint(args.NumCPU),
+ Vdso: vdso,
+ RootUTSNamespace: kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
+ RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace),
+ RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
+ }); err != nil {
+ return nil, fmt.Errorf("initializing kernel: %v", err)
+ }
+
+ if err := adjustDirentCache(k); err != nil {
+ return nil, err
+ }
+
+ // Turn on packet logging if enabled.
+ if args.Conf.LogPackets {
+ log.Infof("Packet logging enabled")
+ atomic.StoreUint32(&sniffer.LogPackets, 1)
+ } else {
+ log.Infof("Packet logging disabled")
+ atomic.StoreUint32(&sniffer.LogPackets, 0)
+ }
+
+ // Create a watchdog.
+ watchdog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction)
+
+ procArgs, err := newProcess(args.ID, args.Spec, creds, k)
+ if err != nil {
+ return nil, fmt.Errorf("creating init process for root container: %v", err)
+ }
+
+ if err := initCompatLogs(args.UserLogFD); err != nil {
+ return nil, fmt.Errorf("initializing compat logs: %v", err)
+ }
+
+ eid := execID{cid: args.ID}
+ l := &Loader{
+ k: k,
+ conf: args.Conf,
+ console: args.Console,
+ watchdog: watchdog,
+ spec: args.Spec,
+ goferFDs: args.GoferFDs,
+ stdioFDs: args.StdioFDs,
+ rootProcArgs: procArgs,
+ sandboxID: args.ID,
+ processes: map[execID]*execProcess{eid: {}},
+ }
+
+ // We don't care about child signals; some platforms can generate a
+ // tremendous number of useless ones (I'm looking at you, ptrace).
+ if err := sighandling.IgnoreChildStop(); err != nil {
+ return nil, fmt.Errorf("ignore child stop signals failed: %v", err)
+ }
+
+ // Handle signals by forwarding them to the root container process
+ // (except for panic signal, which should cause a panic).
+ l.startSignalForwarding = sighandling.PrepareHandler(func(sig linux.Signal) {
+ // Panic signal should cause a panic.
+ if args.Conf.PanicSignal != -1 && sig == linux.Signal(args.Conf.PanicSignal) {
+ panic("Signal-induced panic")
+ }
+
+ // Otherwise forward to root container.
+ deliveryMode := DeliverToProcess
+ if args.Console {
+ // Since we are running with a console, we should
+ // forward the signal to the foreground process group
+ // so that job control signals like ^C can be handled
+ // properly.
+ deliveryMode = DeliverToForegroundProcessGroup
+ }
+ log.Infof("Received external signal %d, mode: %v", sig, deliveryMode)
+ if err := l.signal(args.ID, 0, int32(sig), deliveryMode); err != nil {
+ log.Warningf("error sending signal %v to container %q: %v", sig, args.ID, err)
+ }
+ })
+
+ // Create the control server using the provided FD.
+ //
+ // This must be done *after* we have initialized the kernel since the
+ // controller is used to configure the kernel's network stack.
+ ctrl, err := newController(args.ControllerFD, l)
+ if err != nil {
+ return nil, fmt.Errorf("creating control server: %v", err)
+ }
+ l.ctrl = ctrl
+
+ // Only start serving after Loader is set to controller and controller is set
+ // to Loader, because they are both used in the urpc methods.
+ if err := ctrl.srv.StartServing(); err != nil {
+ return nil, fmt.Errorf("starting control server: %v", err)
+ }
+
+ return l, nil
+}
+
+ // newProcess builds the kernel.CreateProcessArgs describing the init process
+ // of container 'id', suitable for kernel.CreateProcess. Argv, Envv and the
+ // working directory are taken from spec.Process, the resource limits are
+ // derived from the spec by createLimitSet, and the UTS, IPC and abstract
+ // socket namespaces are the root namespaces of kernel k.
+ func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel) (kernel.CreateProcessArgs, error) {
+ // Building the limit set is the only step that can fail.
+ limits, err := createLimitSet(spec)
+ if err != nil {
+ return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %v", err)
+ }
+
+ proc := spec.Process
+ return kernel.CreateProcessArgs{
+ Argv: proc.Args,
+ Envv: proc.Env,
+ WorkingDirectory: proc.Cwd, // Defaults to '/' if empty.
+ Credentials: creds,
+ Umask: 0022,
+ Limits: limits,
+ MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
+ UTSNamespace: k.RootUTSNamespace(),
+ IPCNamespace: k.RootIPCNamespace(),
+ AbstractSocketNamespace: k.RootAbstractSocketNamespace(),
+ ContainerID: id,
+ }, nil
+ }
+
+ // Destroy releases the resources held by the loader: it stops the control
+ // server (if one was created), stops signal forwarding (if it was started) and
+ // stops the watchdog, in that order.
+ //
+ // Note that this will block until all open control server connections have
+ // been closed. For that reason, this should NOT be called in a defer, because
+ // a panic in a control server rpc would then hang forever.
+ func (l *Loader) Destroy() {
+ if ctrl := l.ctrl; ctrl != nil {
+ ctrl.srv.Stop()
+ }
+ if stop := l.stopSignalForwarding; stop != nil {
+ stop()
+ }
+ l.watchdog.Stop()
+ }
+
+ // createPlatform instantiates the platform selected by conf.Platform. The KVM
+ // platform requires a non-nil deviceFile; ptrace does not use it. Any other
+ // platform value is rejected with an error.
+ func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) {
+ if conf.Platform == PlatformPtrace {
+ log.Infof("Platform: ptrace")
+ return ptrace.New()
+ }
+ if conf.Platform != PlatformKVM {
+ return nil, fmt.Errorf("invalid platform %v", conf.Platform)
+ }
+
+ log.Infof("Platform: kvm")
+ if deviceFile == nil {
+ return nil, fmt.Errorf("kvm device file must be provided")
+ }
+ return kvm.New(deviceFile)
+ }
+
+ // createMemoryFile creates a memfd named "runsc-memory" and wraps it in a
+ // pgalloc.MemoryFile. If the MemoryFile cannot be created, the memfd is closed
+ // before the error is returned.
+ func createMemoryFile() (*pgalloc.MemoryFile, error) {
+ const name = "runsc-memory"
+
+ fd, err := memutil.CreateMemFD(name, 0)
+ if err != nil {
+ return nil, fmt.Errorf("error creating memfd: %v", err)
+ }
+
+ backing := os.NewFile(uintptr(fd), name)
+ mf, err := pgalloc.NewMemoryFile(backing, pgalloc.MemoryFileOpts{})
+ if err == nil {
+ return mf, nil
+ }
+
+ // Don't leak the memfd when it could not be handed to a MemoryFile.
+ backing.Close()
+ return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err)
+ }
+
+ // Run runs the root container. The outcome of starting it (nil on success) is
+ // sent on the controller manager's startResultChan and is also returned to
+ // the caller.
+ func (l *Loader) Run() error {
+ runErr := l.run()
+ l.ctrl.manager.startResultChan <- runErr
+ if runErr == nil {
+ return nil
+ }
+
+ // Give the controller some time to send the error to the
+ // runtime. If we return too quickly here the process will exit
+ // and the control connection will be closed before the error
+ // is returned.
+ gtime.Sleep(2 * gtime.Second)
+ return runErr
+ }
+
+ // run performs the actual start of the root container: it configures the host
+ // network when requested, installs the seccomp filters (unless disabled),
+ // creates the root container's init process (unless restoring), wires up the
+ // console TTY, starts signal forwarding and the watchdog, and finally starts
+ // the kernel. l.mu is held from the process lookup until the function returns.
+ func (l *Loader) run() error {
+ if l.conf.Network == NetworkHost {
+ // Delay host network configuration to this point because network namespace
+ // is configured after the loader is created and before Run() is called.
+ log.Debugf("Configuring host network")
+ // Unchecked type assertion: panics if the kernel's network stack is not
+ // a *hostinet.Stack.
+ stack := l.k.NetworkStack().(*hostinet.Stack)
+ if err := stack.Configure(); err != nil {
+ return err
+ }
+ }
+
+ l.mu.Lock()
+ defer l.mu.Unlock()
+
+ // The root container's entry is keyed by the sandbox ID with a zero pid.
+ eid := execID{cid: l.sandboxID}
+ ep, ok := l.processes[eid]
+ if !ok {
+ return fmt.Errorf("trying to start deleted container %q", l.sandboxID)
+ }
+
+ // Finally done with all configuration. Setup filters before user code
+ // is loaded.
+ if l.conf.DisableSeccomp {
+ filter.Report("syscall filter is DISABLED. Running in less secure mode.")
+ } else {
+ opts := filter.Options{
+ Platform: l.k.Platform,
+ HostNetwork: l.conf.Network == NetworkHost,
+ ProfileEnable: l.conf.ProfileEnable,
+ ControllerFD: l.ctrl.srv.FD(),
+ }
+ if err := filter.Install(opts); err != nil {
+ return fmt.Errorf("installing seccomp filters: %v", err)
+ }
+ }
+
+ // If we are restoring, we do not want to create a process.
+ // l.restore is set by the container manager when a restore call is made.
+ if !l.restore {
+ if err := setupContainerFS(
+ &l.rootProcArgs,
+ l.spec,
+ l.conf,
+ l.stdioFDs,
+ l.goferFDs,
+ l.console,
+ l.rootProcArgs.Credentials,
+ l.rootProcArgs.Limits,
+ l.k,
+ "" /* CID, which isn't needed for the root container */); err != nil {
+ return err
+ }
+
+ rootCtx := l.rootProcArgs.NewContext(l.k)
+ rootMns := l.k.RootMountNamespace()
+ if err := setExecutablePath(rootCtx, rootMns, &l.rootProcArgs); err != nil {
+ return err
+ }
+
+ // Create the root container init task. It will begin running
+ // when the kernel is started.
+ if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil {
+ return fmt.Errorf("creating init process: %v", err)
+ }
+
+ // CreateProcess takes a reference on FDMap if successful.
+ l.rootProcArgs.FDMap.DecRef()
+ }
+
+ // Record the kernel's global init thread group as the root container's.
+ ep.tg = l.k.GlobalInit()
+ if l.console {
+ // FD 0 of the root process is taken to be the console TTY; the type
+ // assertion below panics if it is not backed by host.TTYFileOperations.
+ ttyFile := l.rootProcArgs.FDMap.GetFile(0)
+ defer ttyFile.DecRef()
+ ep.tty = ttyFile.FileOperations.(*host.TTYFileOperations)
+
+ // Set the foreground process group on the TTY to the global
+ // init process group, since that is what we are about to
+ // start running.
+ ep.tty.InitForegroundProcessGroup(ep.tg.ProcessGroup())
+ }
+
+ // Start signal forwarding only after an init process is created.
+ l.stopSignalForwarding = l.startSignalForwarding()
+
+ log.Infof("Process should have started...")
+ l.watchdog.Start()
+ return l.k.Start()
+ }
+
+ // createContainer registers a new, not yet started container with the given
+ // ID inside the sandbox. It fails if a container with that ID is already
+ // registered.
+ func (l *Loader) createContainer(cid string) error {
+ l.mu.Lock()
+ defer l.mu.Unlock()
+
+ key := execID{cid: cid}
+ _, exists := l.processes[key]
+ if exists {
+ return fmt.Errorf("container %q already exists", cid)
+ }
+
+ // An entry with no thread group marks the container as created but not
+ // started.
+ l.processes[key] = &execProcess{}
+ return nil
+ }
+
+ // startContainer starts a child container by creating and starting its init
+ // process. On success the new thread group is recorded in l.processes under
+ // the container's ID. Caller owns 'files' and may close them after this
+ // method returns.
+ //
+ // 'files' must hold at least three entries: the first three are used as the
+ // container's stdio and the remaining ones are duplicated and passed on as
+ // gofer FDs. An error is returned if the container was never created (or was
+ // already deleted), or if any setup step fails.
+ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
+ // The first three files are sliced off as stdio below; reject short input
+ // up front instead of panicking on the slice expression.
+ if len(files) < 3 {
+ return fmt.Errorf("starting container %q: expected at least 3 stdio files, got %d", cid, len(files))
+ }
+
+ // Create capabilities.
+ caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
+ if err != nil {
+ return fmt.Errorf("creating capabilities: %v", err)
+ }
+
+ l.mu.Lock()
+ defer l.mu.Unlock()
+
+ eid := execID{cid: cid}
+ if _, ok := l.processes[eid]; !ok {
+ return fmt.Errorf("trying to start a deleted container %q", cid)
+ }
+
+ // Convert the spec's additional GIDs to KGIDs.
+ extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids))
+ for _, GID := range spec.Process.User.AdditionalGids {
+ extraKGIDs = append(extraKGIDs, auth.KGID(GID))
+ }
+
+ // Create credentials. We reuse the root user namespace because the
+ // sentry currently supports only 1 mount namespace, which is tied to a
+ // single user namespace. Thus we must run in the same user namespace
+ // to access mounts.
+ // TODO(b/63601033): Create a new mount namespace for the container.
+ creds := auth.NewUserCredentials(
+ auth.KUID(spec.Process.User.UID),
+ auth.KGID(spec.Process.User.GID),
+ extraKGIDs,
+ caps,
+ l.k.RootUserNamespace())
+
+ procArgs, err := newProcess(cid, spec, creds, l.k)
+ if err != nil {
+ return fmt.Errorf("creating new process: %v", err)
+ }
+
+ // setupContainerFS() dups stdioFDs, so we don't need to dup them here.
+ var stdioFDs []int
+ for _, f := range files[:3] {
+ stdioFDs = append(stdioFDs, int(f.Fd()))
+ }
+
+ // Can't take ownership away from os.File. dup them to get a new FDs.
+ var goferFDs []int
+ for _, f := range files[3:] {
+ fd, err := syscall.Dup(int(f.Fd()))
+ if err != nil {
+ // Nothing else owns the duplicates made so far; close them
+ // (best effort) so that a partial failure doesn't leak FDs.
+ for _, dupFD := range goferFDs {
+ syscall.Close(dupFD)
+ }
+ return fmt.Errorf("failed to dup file: %v", err)
+ }
+ goferFDs = append(goferFDs, fd)
+ }
+
+ if err := setupContainerFS(
+ &procArgs,
+ spec,
+ conf,
+ stdioFDs,
+ goferFDs,
+ false,
+ creds,
+ procArgs.Limits,
+ k,
+ cid); err != nil {
+ return fmt.Errorf("configuring container FS: %v", err)
+ }
+
+ ctx := procArgs.NewContext(l.k)
+ mns := k.RootMountNamespace()
+ if err := setExecutablePath(ctx, mns, &procArgs); err != nil {
+ return fmt.Errorf("setting executable path for %+v: %v", procArgs, err)
+ }
+
+ // Create and start the new process.
+ tg, _, err := l.k.CreateProcess(procArgs)
+ if err != nil {
+ return fmt.Errorf("creating process: %v", err)
+ }
+ l.k.StartProcess(tg)
+
+ // CreateProcess takes a reference on FDMap if successful.
+ procArgs.FDMap.DecRef()
+
+ l.processes[eid].tg = tg
+ return nil
+ }
+
+ // destroyContainer tears down container 'cid': if the container has started,
+ // all of its processes are sent SIGKILL (and waited for); then every entry
+ // belonging to the container is dropped from l.processes and its filesystem
+ // is destroyed.
+ func (l *Loader) destroyContainer(cid string) error {
+ l.mu.Lock()
+ defer l.mu.Unlock()
+
+ // A successful lookup means the container has started, so there may be
+ // processes to kill.
+ _, _, lookupErr := l.threadGroupFromIDLocked(execID{cid: cid})
+ if started := lookupErr == nil; started {
+ if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
+ return fmt.Errorf("sending SIGKILL to all container processes: %v", err)
+ }
+ }
+
+ // Forget the init process and any exec'd processes of this container.
+ for eid := range l.processes {
+ if eid.cid != cid {
+ continue
+ }
+ delete(l.processes, eid)
+ }
+
+ rootCtx := l.rootProcArgs.NewContext(l.k)
+ if err := destroyContainerFS(rootCtx, cid, l.k); err != nil {
+ return fmt.Errorf("destroying filesystem for container %q: %v", cid, err)
+ }
+
+ // We made it!
+ log.Debugf("Container destroyed %q", cid)
+ return nil
+ }
+
+ // executeAsync starts a new process inside the already started container
+ // named by args.ContainerID, using the root directory of that container's
+ // init task. The new thread group (and its TTY, if any) is recorded in
+ // l.processes keyed by container ID and the new thread group ID, which is
+ // also returned.
+ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
+ // Hold the lock for the entire operation to ensure that exec'd process is
+ // added to 'processes' in case it races with destroyContainer().
+ l.mu.Lock()
+ defer l.mu.Unlock()
+
+ // Note: the underlying lookup error is replaced by a generic message here.
+ tg, _, err := l.threadGroupFromIDLocked(execID{cid: args.ContainerID})
+ if err != nil {
+ return 0, fmt.Errorf("no such container: %q", args.ContainerID)
+ }
+
+ // Get the container Root Dirent from the Task, since we must run this
+ // process with the same Root.
+ tg.Leader().WithMuLocked(func(t *kernel.Task) {
+ args.Root = t.FSContext().RootDirectory()
+ })
+ // Release the root directory reference once the exec call is done.
+ if args.Root != nil {
+ defer args.Root.DecRef()
+ }
+
+ // Start the process.
+ proc := control.Proc{Kernel: l.k}
+ newTG, tgid, ttyFile, err := control.ExecAsync(&proc, args)
+ if err != nil {
+ return 0, err
+ }
+
+ eid := execID{cid: args.ContainerID, pid: tgid}
+ l.processes[eid] = &execProcess{
+ tg: newTG,
+ tty: ttyFile,
+ }
+ log.Debugf("updated processes: %v", l.processes)
+
+ return tgid, nil
+ }
+
+ // waitContainer blocks until the init process of container 'cid' has exited
+ // and stores its exit status in *waitStatus. It fails if the container is
+ // unknown or has not been started.
+ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
+ // The loader lock is only taken for the lookup (inside threadGroupFromID)
+ // and not held while waiting; holding it would make it impossible for
+ // multiple clients to wait on the same container.
+ initTG, _, err := l.threadGroupFromID(execID{cid: cid})
+ if err != nil {
+ return fmt.Errorf("can't wait for container %q: %v", cid, err)
+ }
+
+ // Whether the thread group has already exited or exits while we wait,
+ // the container is considered exited.
+ *waitStatus = l.wait(initTG)
+ return nil
+ }
+
+ // waitPID blocks until the process with thread group ID 'tgid' in container
+ // 'cid' has exited and stores its exit status in *waitStatus. Processes
+ // started via exec are looked up in l.processes first; if clearStatus is set,
+ // such an entry is removed once the wait completes. Otherwise the process is
+ // looked up in the PID namespace of the container's init process and must
+ // belong to the same container.
+ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, waitStatus *uint32) error {
+ if tgid <= 0 {
+ return fmt.Errorf("PID (%d) must be positive", tgid)
+ }
+
+ // First try to find a process that was exec'd.
+ key := execID{cid: cid, pid: tgid}
+ if execTG, _, err := l.threadGroupFromID(key); err == nil {
+ *waitStatus = l.wait(execTG)
+
+ // Drop the thread group from the cache if the caller asked for it.
+ if clearStatus {
+ l.mu.Lock()
+ delete(l.processes, key)
+ log.Debugf("updated processes (removal): %v", l.processes)
+ l.mu.Unlock()
+ }
+ return nil
+ }
+
+ // The caller may be waiting on a process not started directly via exec.
+ // In this case, find the process in the container's PID namespace.
+ initTG, _, err := l.threadGroupFromID(execID{cid: cid})
+ if err != nil {
+ return fmt.Errorf("waiting for PID %d: %v", tgid, err)
+ }
+
+ tg := initTG.PIDNamespace().ThreadGroupWithID(tgid)
+ switch {
+ case tg == nil:
+ return fmt.Errorf("waiting for PID %d: no such process", tgid)
+ case tg.Leader().ContainerID() != cid:
+ return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID())
+ }
+
+ *waitStatus = l.wait(tg)
+ return nil
+ }
+
+ // wait blocks until thread group 'tg' has exited and returns the status value
+ // of its exit status.
+ func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 {
+ tg.WaitExited()
+ exit := tg.ExitStatus()
+ return exit.Status()
+ }
+
+ // WaitForStartSignal waits for a start signal from the control server. It
+ // blocks until a value is received on (or the channel is closed for) the
+ // controller manager's startChan.
+ func (l *Loader) WaitForStartSignal() {
+ <-l.ctrl.manager.startChan
+ }
+
+ // WaitExit waits for the root container to exit, and returns its exit status.
+ // It blocks until the kernel reports that it has exited and then returns the
+ // exit status of the kernel's global init thread group.
+ func (l *Loader) WaitExit() kernel.ExitStatus {
+ // Wait for container.
+ l.k.WaitExited()
+
+ return l.k.GlobalInit().ExitStatus()
+ }
+
+ // newEmptyNetworkStack creates the network stack selected by conf.Network
+ // without any links or routes configured. NetworkHost yields a hostinet
+ // stack; NetworkNone and NetworkSandbox yield a netstack-backed
+ // epsocket.Stack with IPv4, IPv6 and ARP network protocols, TCP, UDP and
+ // ICMPv4 transport protocols, and TCP SACK enabled. 'clock' is only used for
+ // the netstack case.
+ //
+ // It panics if conf.Network holds any other value.
+ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
+ switch conf.Network {
+ case NetworkHost:
+ return hostinet.NewStack(), nil
+
+ case NetworkNone, NetworkSandbox:
+ // NetworkNone sets up loopback using netstack.
+ netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName}
+ protoNames := []string{tcp.ProtocolName, udp.ProtocolName, icmp.ProtocolName4}
+ s := epsocket.Stack{stack.New(netProtos, protoNames, stack.Options{
+ Clock: clock,
+ Stats: epsocket.Metrics,
+ HandleLocal: true,
+ // Enable raw sockets for users with sufficient
+ // privileges.
+ Raw: true,
+ })}
+ if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
+ return nil, fmt.Errorf("failed to enable SACK: %v", err)
+ }
+ return &s, nil
+
+ default:
+ panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
+ }
+ }
+
+ // signal delivers signal 'signo' to container 'cid' according to 'mode':
+ //
+ // - DeliverToProcess: to the process 'pid'.
+ // - DeliverToForegroundProcessGroup: to the foreground process group of the
+ // TTY attached to process 'pid'.
+ // - DeliverToAllProcesses: to every process in the container; 'pid' must be
+ // 0 and the container must have been started.
+ //
+ // A 'pid' of 0 selects the container's init process. Negative PIDs are
+ // rejected, and an unknown delivery mode causes a panic.
+ func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error {
+ if pid < 0 {
+ return fmt.Errorf("PID (%d) must be positive", pid)
+ }
+
+ switch tgid := kernel.ThreadID(pid); mode {
+ case DeliverToProcess:
+ err := l.signalProcess(cid, tgid, signo)
+ if err == nil {
+ return nil
+ }
+ return fmt.Errorf("signaling process in container %q PID %d: %v", cid, pid, err)
+
+ case DeliverToForegroundProcessGroup:
+ err := l.signalForegrondProcessGroup(cid, tgid, signo)
+ if err == nil {
+ return nil
+ }
+ return fmt.Errorf("signaling foreground process group in container %q PID %d: %v", cid, pid, err)
+
+ case DeliverToAllProcesses:
+ if pid != 0 {
+ return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid)
+ }
+ // Check that the container has actually started before signaling it.
+ if _, _, err := l.threadGroupFromID(execID{cid: cid}); err != nil {
+ return err
+ }
+ err := l.signalAllProcesses(cid, signo)
+ if err == nil {
+ return nil
+ }
+ return fmt.Errorf("signaling all processes in container %q: %v", cid, err)
+
+ default:
+ panic(fmt.Sprintf("unknown signal delivery mode %v", mode))
+ }
+ }
+
+ // signalProcess sends signal 'signo' to the process with thread group ID
+ // 'tgid' in container 'cid'. Processes tracked in l.processes (the init
+ // process when tgid is 0, or exec'd processes) are signaled directly; any
+ // other process is looked up in the PID namespace of the container's init
+ // process and must belong to the same container.
+ func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error {
+ info := &arch.SignalInfo{Signo: signo}
+
+ // Fast path: the process is one the loader tracks directly.
+ if execTG, _, err := l.threadGroupFromID(execID{cid: cid, pid: tgid}); err == nil {
+ return execTG.SendSignal(info)
+ }
+
+ // The caller may be signaling a process not started directly via exec.
+ // In this case, find the process in the container's PID namespace and
+ // signal it.
+ initTG, _, err := l.threadGroupFromID(execID{cid: cid})
+ if err != nil {
+ return fmt.Errorf("no thread group found: %v", err)
+ }
+
+ tg := initTG.PIDNamespace().ThreadGroupWithID(tgid)
+ switch {
+ case tg == nil:
+ return fmt.Errorf("no such process with PID %d", tgid)
+ case tg.Leader().ContainerID() != cid:
+ return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID())
+ }
+ return tg.SendSignal(info)
+ }
+
+ // signalForegrondProcessGroup sends signal 'signo' to every thread group in
+ // the foreground process group of the TTY attached to the process identified
+ // by 'cid' and 'tgid'. The process must be tracked in l.processes and have a
+ // TTY. If the TTY has no foreground process group, the signal is sent to the
+ // identified process only. When several sends fail, only the last error is
+ // returned.
+ func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error {
+ // Lookup foreground process group from the TTY for the given process,
+ // and send the signal to it.
+ tg, tty, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
+ if err != nil {
+ return fmt.Errorf("no thread group found: %v", err)
+ }
+ if tty == nil {
+ return fmt.Errorf("no TTY attached")
+ }
+ pg := tty.ForegroundProcessGroup()
+ if pg == nil {
+ // No foreground process group has been set. Signal the
+ // original thread group.
+ log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid)
+ return tg.SendSignal(&arch.SignalInfo{Signo: signo})
+ }
+ // Send the signal to all processes in the process group.
+ var lastErr error
+ // Scan the thread groups of the root task set and pick the members of pg.
+ for _, tg := range l.k.TaskSet().Root.ThreadGroups() {
+ if tg.ProcessGroup() != pg {
+ continue
+ }
+ // Keep going after a failure so the remaining members are still signaled.
+ if err := tg.SendSignal(&arch.SignalInfo{Signo: signo}); err != nil {
+ lastErr = err
+ }
+ }
+ return lastErr
+ }
+
+ // signalAllProcesses sends signal 'signo' to every process that belongs to
+ // container 'cid'. It's a noop if the container hasn't started or has exited.
+ // When the signal is SIGKILL, it additionally waits for the thread group of
+ // every task of the container to exit.
+ func (l *Loader) signalAllProcesses(cid string, signo int32) error {
+ // Pause the kernel to prevent new processes from being created while
+ // the signal is delivered. This prevents process leaks when SIGKILL is
+ // sent to the entire container.
+ l.k.Pause()
+ err := l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo})
+ l.k.Unpause()
+ if err != nil {
+ return err
+ }
+
+ // Only SIGKILL requires waiting for the processes to go away.
+ if linux.Signal(signo) != linux.SIGKILL {
+ return nil
+ }
+ for _, t := range l.k.TaskSet().Root.Tasks() {
+ if t.ContainerID() != cid {
+ continue
+ }
+ t.ThreadGroup().WaitExited()
+ }
+ return nil
+ }
+
+ // threadGroupFromID is the same as threadGroupFromIDLocked except that it
+ // acquires 'mu' before calling it and releases it on return. It must not be
+ // called with 'mu' already held.
+ func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, error) {
+ l.mu.Lock()
+ defer l.mu.Unlock()
+ return l.threadGroupFromIDLocked(key)
+ }
+
+ // threadGroupFromIDLocked looks up the thread group and TTY recorded for
+ // execution ID 'key'. The TTY may be nil if the process is not attached to a
+ // terminal. An error is returned when no entry exists for the ID, or when the
+ // entry exists but no thread group has been recorded for it yet (i.e. the
+ // container/process has not started). Caller must hold 'mu'.
+ func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, error) {
+ entry := l.processes[key]
+ switch {
+ case entry == nil:
+ return nil, nil, fmt.Errorf("container not found")
+ case entry.tg == nil:
+ return nil, nil, fmt.Errorf("container not started")
+ }
+ return entry.tg, entry.tty, nil
+ }
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
new file mode 100644
index 000000000..0a154d90b
--- /dev/null
+++ b/runsc/boot/network.go
@@ -0,0 +1,222 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+ "net"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/link/loopback"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
+ "gvisor.googlesource.com/gvisor/pkg/urpc"
+)
+
+ // Network exposes methods that can be used to configure a network stack.
+ type Network struct {
+ // Stack is the network stack that the methods of Network configure.
+ Stack *stack.Stack
+ }
+
+ // Route represents a route in the network stack. A Route whose fields are all
+ // nil is considered unset (see Empty).
+ type Route struct {
+ // Destination and Mask together select the addresses the route applies to.
+ Destination net.IP
+ Mask net.IPMask
+ // Gateway is the gateway address of the route.
+ Gateway net.IP
+ }
+
+ // DefaultRoute represents a catch all route to the default gateway.
+ type DefaultRoute struct {
+ // Route is the default route itself; it is ignored when empty.
+ Route Route
+ // Name is the name of the link (interface) the route is attached to.
+ Name string
+ }
+
+ // FDBasedLink configures an fd-based link. The file descriptor backing the
+ // link is not part of this struct; it is passed separately in the FilePayload
+ // of CreateLinksAndRoutesArgs.
+ type FDBasedLink struct {
+ // Name is the interface name of the link.
+ Name string
+ // MTU is passed to the fd-based endpoint as its MTU.
+ MTU int
+ // Addresses are the IP addresses added to the link's NIC.
+ Addresses []net.IP
+ // Routes are the routes to install for this link.
+ Routes []Route
+ // GSOMaxSize is passed through to the fd-based endpoint's options.
+ GSOMaxSize uint32
+ // LinkAddress is the link-layer (MAC) address of the endpoint.
+ LinkAddress []byte
+ }
+
+ // LoopbackLink configures a loopback link.
+ type LoopbackLink struct {
+ // Name is the interface name of the link.
+ Name string
+ // Addresses are the IP addresses added to the link's NIC.
+ Addresses []net.IP
+ // Routes are the routes to install for this link.
+ Routes []Route
+ }
+
+ // CreateLinksAndRoutesArgs are arguments to CreateLinksAndRoutes.
+ type CreateLinksAndRoutesArgs struct {
+ // FilePayload contains the fds associated with the FDBasedLinks. The
+ // two slices must have the same length.
+ urpc.FilePayload
+
+ // LoopbackLinks are created before FDBasedLinks.
+ LoopbackLinks []LoopbackLink
+ // FDBasedLinks[i] is backed by the i-th file of FilePayload.
+ FDBasedLinks []FDBasedLink
+
+ // DefaultGateway is optional; it is only installed when its Route is set.
+ DefaultGateway DefaultRoute
+ }
+
+ // Empty reports whether the route is unset, i.e. whether Destination, Mask
+ // and Gateway are all nil.
+ func (r *Route) Empty() bool {
+ if r.Destination != nil || r.Mask != nil {
+ return false
+ }
+ return r.Gateway == nil
+ }
+
+ // toTcpipRoute converts the route into a tcpip.Route attached to NIC 'id'.
+ func (r *Route) toTcpipRoute(id tcpip.NICID) tcpip.Route {
+ dst := ipToAddress(r.Destination)
+ gw := ipToAddress(r.Gateway)
+ // The mask goes through the same IP conversion as the addresses.
+ mask := ipToAddressMask(net.IP(r.Mask))
+
+ return tcpip.Route{
+ Destination: dst,
+ Gateway: gw,
+ Mask: mask,
+ NIC: id,
+ }
+ }
+
+ // CreateLinksAndRoutes creates links and routes in a network stack. It should
+ // only be called once.
+ //
+ // Loopback links are created first, followed by the FD-based links; NIC IDs
+ // are assigned sequentially, starting at 1, in that order. The i-th file of
+ // args.FilePayload backs args.FDBasedLinks[i]; each endpoint is built on a
+ // duplicate of that file's descriptor. The routes of all links, plus the
+ // default gateway route when one is set, replace the stack's route table.
+ //
+ // An error is returned if the number of files and FD-based links differ, if
+ // any link cannot be created, or if the default gateway names an unknown
+ // interface.
+ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error {
+ if len(args.FilePayload.Files) != len(args.FDBasedLinks) {
+ return fmt.Errorf("FilePayload must be same length as FDBasedLinks")
+ }
+
+ var nicID tcpip.NICID
+ nicids := make(map[string]tcpip.NICID)
+
+ // Collect routes from all links.
+ var routes []tcpip.Route
+
+ // Loopback normally appear before other interfaces.
+ for _, link := range args.LoopbackLinks {
+ nicID++
+ nicids[link.Name] = nicID
+
+ linkEP := loopback.New()
+
+ log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses)
+ if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, true /* loopback */); err != nil {
+ return err
+ }
+
+ // Collect the routes from this link.
+ for _, r := range link.Routes {
+ routes = append(routes, r.toTcpipRoute(nicID))
+ }
+ }
+
+ for i, link := range args.FDBasedLinks {
+ nicID++
+ nicids[link.Name] = nicID
+
+ // Copy the underlying FD.
+ oldFD := args.FilePayload.Files[i].Fd()
+ newFD, err := syscall.Dup(int(oldFD))
+ if err != nil {
+ return fmt.Errorf("failed to dup FD %v: %v", oldFD, err)
+ }
+
+ mac := tcpip.LinkAddress(link.LinkAddress)
+ linkEP, err := fdbased.New(&fdbased.Options{
+ FD: newFD,
+ MTU: uint32(link.MTU),
+ EthernetHeader: true,
+ Address: mac,
+ PacketDispatchMode: fdbased.RecvMMsg,
+ GSOMaxSize: link.GSOMaxSize,
+ RXChecksumOffload: true,
+ })
+ if err != nil {
+ // No endpoint was created, so nothing owns the duplicate yet.
+ // Close it (best effort) rather than leak it.
+ // NOTE(review): assumes fdbased.New does not close the FD on
+ // failure -- confirm.
+ syscall.Close(newFD)
+ return err
+ }
+
+ log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac)
+ if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, false /* loopback */); err != nil {
+ return err
+ }
+
+ // Collect the routes from this link.
+ for _, r := range link.Routes {
+ routes = append(routes, r.toTcpipRoute(nicID))
+ }
+ }
+
+ if !args.DefaultGateway.Route.Empty() {
+ // Use a distinct name so the NIC ID counter above is not shadowed.
+ gwNICID, ok := nicids[args.DefaultGateway.Name]
+ if !ok {
+ return fmt.Errorf("invalid interface name %q for default route", args.DefaultGateway.Name)
+ }
+ routes = append(routes, args.DefaultGateway.Route.toTcpipRoute(gwNICID))
+ }
+
+ log.Infof("Setting routes %+v", routes)
+ n.Stack.SetRouteTable(routes)
+ return nil
+ }
+
+ // createNICWithAddrs creates the NIC 'id' named 'name' on top of linkEP
+ // (wrapped in a sniffer endpoint), as a loopback NIC when 'loopback' is set,
+ // and then adds the ARP protocol address followed by every address in
+ // 'addrs' to it. It stops at the first failure.
+ func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, addrs []net.IP, loopback bool) error {
+ // Both kinds of NIC are created on the sniffer-wrapped endpoint.
+ wrappedEP := sniffer.New(linkEP)
+ switch {
+ case loopback:
+ if err := n.Stack.CreateNamedLoopbackNIC(id, name, wrappedEP); err != nil {
+ return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v, %v) failed: %v", id, name, linkEP, err)
+ }
+ default:
+ if err := n.Stack.CreateNamedNIC(id, name, wrappedEP); err != nil {
+ return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, linkEP, err)
+ }
+ }
+
+ // Always start with an arp address for the NIC.
+ if err := n.Stack.AddAddress(id, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+ return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, arp.ProtocolNumber, arp.ProtocolAddress, err)
+ }
+
+ for _, ip := range addrs {
+ protoNum, address := ipToAddressAndProto(ip)
+ if err := n.Stack.AddAddress(id, protoNum, address); err != nil {
+ return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, protoNum, address, err)
+ }
+ }
+ return nil
+ }
+
+ // ipToAddressAndProto converts 'ip' into a tcpip.Address together with the
+ // matching network protocol number: the 4-byte form and the IPv4 protocol
+ // number if the IP is an IPv4 address, otherwise the IP as given and the IPv6
+ // protocol number.
+ //
+ // Note: the version is detected with To4 rather than 'len(ip)', because a
+ // net.IP holding an IPv4 address may still be 16 bytes long.
+ func ipToAddressAndProto(ip net.IP) (tcpip.NetworkProtocolNumber, tcpip.Address) {
+ v4 := ip.To4()
+ if v4 == nil {
+ return ipv6.ProtocolNumber, tcpip.Address(ip)
+ }
+ return ipv4.ProtocolNumber, tcpip.Address(v4)
+ }
+
+ // ipToAddress converts IP to tcpip.Address, ignoring the protocol. It is a
+ // thin wrapper around ipToAddressAndProto that drops the protocol number.
+ func ipToAddress(ip net.IP) tcpip.Address {
+ _, addr := ipToAddressAndProto(ip)
+ return addr
+ }
+
+ // ipToAddressMask converts IP to tcpip.AddressMask, ignoring the protocol.
+ // The mask bytes go through the same conversion as an address
+ // (ipToAddressAndProto) and the result is retyped as an AddressMask.
+ func ipToAddressMask(ip net.IP) tcpip.AddressMask {
+ _, addr := ipToAddressAndProto(ip)
+ return tcpip.AddressMask(addr)
+ }
diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go
new file mode 100644
index 000000000..19c7f8fbd
--- /dev/null
+++ b/runsc/boot/strace.go
@@ -0,0 +1,40 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/strace"
+)
+
+ // enableStrace initializes the strace package and, when conf.Strace is set,
+ // turns on syscall tracing to the log sink. The maximum log size is taken
+ // from conf.StraceLogSize (1024 when that is zero). Tracing is enabled for
+ // the syscalls listed in conf.StraceSyscalls, or for all syscalls when the
+ // list is empty.
+ func enableStrace(conf *Config) error {
+ // We must initialize even if strace is not enabled.
+ strace.Initialize()
+
+ if !conf.Strace {
+ return nil
+ }
+
+ // A size of zero means "not configured"; fall back to the default.
+ limit := conf.StraceLogSize
+ if limit == 0 {
+ limit = 1024
+ }
+ strace.LogMaximumSize = limit
+
+ if len(conf.StraceSyscalls) > 0 {
+ return strace.Enable(conf.StraceSyscalls, strace.SinkTypeLog)
+ }
+ strace.EnableAll(strace.SinkTypeLog)
+ return nil
+ }
diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go
new file mode 100644
index 000000000..7431b17d6
--- /dev/null
+++ b/runsc/cgroup/cgroup.go
@@ -0,0 +1,503 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package cgroup provides an interface to read and write configuration to
+// cgroup.
+package cgroup
+
+import (
+ "bufio"
+ "context"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "strconv"
+ "strings"
+ "syscall"
+ "time"
+
+ "github.com/cenkalti/backoff"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+const (
+ // cgroupRoot is the directory that every controller path built by this
+ // package is rooted at.
+ cgroupRoot = "/sys/fs/cgroup"
+)
+
+// controllers maps the name of every cgroup controller handled by this
+// package to the implementation that applies OCI resource settings to it.
+var controllers = map[string]controller{
+ "blkio": &blockIO{},
+ "cpu": &cpu{},
+ "cpuset": &cpuSet{},
+ "memory": &memory{},
+ "net_cls": &networkClass{},
+ "net_prio": &networkPrio{},
+
+ // These controllers either don't have anything in the OCI spec or are
+ // irrelevant for a sandbox, e.g. pids.
+ "devices": &noop{},
+ "freezer": &noop{},
+ "perf_event": &noop{},
+ "pids": &noop{},
+ "systemd": &noop{},
+}
+
+// setOptionalValueInt writes *val, formatted in base 10, to the file 'name'
+// under 'path'. A nil pointer or a zero value counts as "not set" and nothing
+// is written.
+func setOptionalValueInt(path, name string, val *int64) error {
+	if val != nil && *val != 0 {
+		return setValue(path, name, strconv.FormatInt(*val, 10))
+	}
+	return nil
+}
+
+// setOptionalValueUint writes *val, formatted in base 10, to the file 'name'
+// under 'path'. A nil pointer or a zero value counts as "not set" and nothing
+// is written.
+func setOptionalValueUint(path, name string, val *uint64) error {
+	if val != nil && *val != 0 {
+		return setValue(path, name, strconv.FormatUint(*val, 10))
+	}
+	return nil
+}
+
+// setOptionalValueUint32 writes *val, formatted in base 10, to the file
+// 'name' under 'path'. A nil pointer or a zero value counts as "not set" and
+// nothing is written.
+func setOptionalValueUint32(path, name string, val *uint32) error {
+	if val != nil && *val != 0 {
+		return setValue(path, name, strconv.FormatUint(uint64(*val), 10))
+	}
+	return nil
+}
+
+// setOptionalValueUint16 writes *val, formatted in base 10, to the file
+// 'name' under 'path'. A nil pointer or a zero value counts as "not set" and
+// nothing is written.
+func setOptionalValueUint16(path, name string, val *uint16) error {
+	if val != nil && *val != 0 {
+		return setValue(path, name, strconv.FormatUint(uint64(*val), 10))
+	}
+	return nil
+}
+
+// setValue writes 'data' to the file 'name' inside the directory 'path'.
+func setValue(path, name, data string) error {
+	return ioutil.WriteFile(filepath.Join(path, name), []byte(data), 0700)
+}
+
+// getValue returns the full, untrimmed contents of the file 'name' inside
+// the directory 'path'.
+func getValue(path, name string) (string, error) {
+	raw, err := ioutil.ReadFile(filepath.Join(path, name))
+	if err != nil {
+		return "", err
+	}
+	return string(raw), nil
+}
+
+// fillFromAncestor makes sure the cgroup file at 'path' has a value and
+// returns that value with surrounding whitespace trimmed. A file that
+// already has content is left alone. An empty file is populated with the
+// value of the same-named file in the closest ancestor directory that has
+// one, and every empty file found on the way up is filled in too.
+func fillFromAncestor(path string) (string, error) {
+	raw, err := ioutil.ReadFile(path)
+	if err != nil {
+		return "", err
+	}
+	if current := strings.TrimSpace(string(raw)); current != "" {
+		// Already set, nothing to inherit.
+		return current, nil
+	}
+
+	// Empty: resolve the same file one directory up first, then copy the
+	// result down into this one.
+	ancestor := filepath.Join(filepath.Dir(filepath.Dir(path)), filepath.Base(path))
+	inherited, err := fillFromAncestor(ancestor)
+	if err != nil {
+		return "", err
+	}
+	if err := ioutil.WriteFile(path, []byte(inherited), 0700); err != nil {
+		return "", err
+	}
+	return inherited, nil
+}
+
+// countCpuset counts the CPUs named by a cpuset list string such as
+// "0-2,7,12-14" (see man 7 cpuset), which names 7 CPUs. Each comma-separated
+// entry is either a single number or an inclusive "start-end" range. An
+// entry that is not a number, has more than one '-', or has start > end
+// yields an error.
+func countCpuset(cpuset string) (int, error) {
+	total := 0
+	for _, part := range strings.Split(cpuset, ",") {
+		bounds := strings.Split(part, "-")
+		if len(bounds) > 2 {
+			return 0, fmt.Errorf("invalid cpuset: %q", part)
+		}
+
+		first, err := strconv.Atoi(bounds[0])
+		if err != nil {
+			return 0, err
+		}
+		if len(bounds) == 1 {
+			// A lone CPU number.
+			total++
+			continue
+		}
+
+		// An inclusive range.
+		last, err := strconv.Atoi(bounds[1])
+		if err != nil {
+			return 0, err
+		}
+		if first < 0 || last < 0 || first > last {
+			return 0, fmt.Errorf("invalid cpuset: %q", part)
+		}
+		total += last - first + 1
+	}
+	return total, nil
+}
+
+// LoadPaths reads /proc/<pid>/cgroup and returns a map from controller name
+// to the cgroup path the process belongs to for that controller. 'pid' may
+// be "self" to inspect the calling process. A line that lists several
+// controllers produces one map entry per controller, all sharing the same
+// path.
+func LoadPaths(pid string) (map[string]string, error) {
+	f, err := os.Open(filepath.Join("/proc", pid, "cgroup"))
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	paths := make(map[string]string)
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		// Format: ID:controller1,controller2:path
+		// Example: 2:cpu,cpuacct:/user.slice
+		//
+		// Only the first two colons are separators; the path is everything
+		// after them and may itself contain ':'.
+		tokens := strings.SplitN(scanner.Text(), ":", 3)
+		if len(tokens) != 3 {
+			return nil, fmt.Errorf("invalid cgroups file, line: %q", scanner.Text())
+		}
+		for _, ctrlr := range strings.Split(tokens[1], ",") {
+			paths[ctrlr] = tokens[2]
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+	return paths, nil
+}
+
+// Cgroup represents a group inside all controllers. For example: Name='/foo/bar'
+// maps to /sys/fs/cgroup/<controller>/foo/bar on all controllers.
+type Cgroup struct {
+ // Name is the cgroup path, taken from Linux.CgroupsPath in the OCI spec.
+ Name string `json:"name"`
+ // Parents maps a controller name to the cgroup path that Name is joined
+ // onto for that controller. New only fills it in when Name is relative.
+ Parents map[string]string `json:"parents"`
+ // Own is set by Install when it creates the cgroup itself; Uninstall does
+ // nothing unless it is set.
+ Own bool `json:"own"`
+}
+
+// New builds a Cgroup from the cgroup path in the spec. It returns
+// (nil, nil) when the spec has no Linux section or no cgroup path. When the
+// path is relative, the cgroups of the current process are recorded as its
+// parents.
+func New(spec *specs.Spec) (*Cgroup, error) {
+	if spec.Linux == nil || spec.Linux.CgroupsPath == "" {
+		return nil, nil
+	}
+
+	cg := &Cgroup{Name: spec.Linux.CgroupsPath}
+	if filepath.IsAbs(cg.Name) {
+		return cg, nil
+	}
+
+	// Relative path: anchor it to the cgroups this process lives in.
+	parents, err := LoadPaths("self")
+	if err != nil {
+		return nil, fmt.Errorf("finding current cgroups: %v", err)
+	}
+	cg.Parents = parents
+	return cg, nil
+}
+
+// Install creates the cgroup directory under every known controller and
+// applies 'res' to each of them. If the cgroup already exists under the
+// memory controller, it is assumed to have been set up by the caller: 'res'
+// is ignored and nothing is changed. If any step fails, whatever was created
+// so far is removed again.
+func (c *Cgroup) Install(res *specs.LinuxResources) error {
+	_, statErr := os.Stat(c.makePath("memory"))
+	if statErr == nil {
+		// Pre-created by the caller. Leave its configuration alone; the
+		// sandbox/gofer only joins it when it starts.
+		log.Debugf("Using pre-created cgroup %q", c.Name)
+		return nil
+	}
+
+	log.Debugf("Creating cgroup %q", c.Name)
+
+	// From here on the cgroup belongs to us, so Uninstall may remove it.
+	c.Own = true
+
+	// Tear down partially created cgroups on failure. Errors occurring
+	// during that cleanup are deliberately ignored.
+	cleanup := specutils.MakeCleanup(func() { _ = c.Uninstall() })
+	defer cleanup.Clean()
+
+	for name, ctrl := range controllers {
+		dir := c.makePath(name)
+		if err := os.MkdirAll(dir, 0755); err != nil {
+			return err
+		}
+		if res == nil {
+			continue
+		}
+		if err := ctrl.set(res, dir); err != nil {
+			return err
+		}
+	}
+	cleanup.Release()
+	return nil
+}
+
+// Uninstall removes the cgroup directories created by Install(). If the
+// cgroup path already existed when Install() was called (c.Own is false),
+// Uninstall is a noop. Removal of each controller directory is retried every
+// 100ms for up to 5 seconds; the first directory that cannot be removed
+// aborts the operation with an error.
+func (c *Cgroup) Uninstall() error {
+	if !c.Own {
+		// cgroup is managed by caller, don't touch it.
+		return nil
+	}
+	log.Debugf("Deleting cgroup %q", c.Name)
+	for key := range controllers {
+		path := c.makePath(key)
+		log.Debugf("Removing cgroup controller for key=%q path=%q", key, path)
+
+		// If we try to remove the cgroup too soon after killing the
+		// sandbox we might get EBUSY, so we retry for a few seconds
+		// until it succeeds.
+		//
+		// The retry runs in its own function so that the deferred cancel
+		// fires at the end of every iteration, instead of piling up one live
+		// timeout context per controller until Uninstall returns.
+		err := func() error {
+			ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+			defer cancel()
+			b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
+			return backoff.Retry(func() error {
+				err := syscall.Rmdir(path)
+				if os.IsNotExist(err) {
+					// Already gone, nothing left to do.
+					return nil
+				}
+				return err
+			}, b)
+		}()
+		if err != nil {
+			return fmt.Errorf("removing cgroup path %q: %v", path, err)
+		}
+	}
+	return nil
+}
+
+// Join adds the current process to this cgroup in all controllers. It
+// returns a function that moves the process back into the cgroups it was in
+// before the call. The returned function is never nil and is safe to call
+// even when Join returns an error; in that case it undoes whatever part of
+// the join had already happened.
+func (c *Cgroup) Join() (func(), error) {
+	// First save the current state so it can be restored.
+	undo := func() {}
+	paths, err := LoadPaths("self")
+	if err != nil {
+		return undo, err
+	}
+	var undoPaths []string
+	for ctrlr, path := range paths {
+		// Skip controllers we don't handle. Every handled controller must be
+		// recorded: stopping at the first match would leave the process in
+		// the new cgroup for all the other controllers after undo runs.
+		if _, ok := controllers[ctrlr]; ok {
+			fullPath := filepath.Join(cgroupRoot, ctrlr, path)
+			undoPaths = append(undoPaths, fullPath)
+		}
+	}
+
+	// Replace empty undo with the real thing before changes are made to cgroups.
+	undo = func() {
+		for _, path := range undoPaths {
+			log.Debugf("Restoring cgroup %q", path)
+			// Writing "0" to cgroup.procs moves the writing process.
+			if err := setValue(path, "cgroup.procs", "0"); err != nil {
+				log.Warningf("Error restoring cgroup %q: %v", path, err)
+			}
+		}
+	}
+
+	// Now join the cgroups.
+	for key := range controllers {
+		path := c.makePath(key)
+		log.Debugf("Joining cgroup %q", path)
+		if err := setValue(path, "cgroup.procs", "0"); err != nil {
+			return undo, err
+		}
+	}
+	return undo, nil
+}
+
+// NumCPU counts the CPUs listed in this cgroup's 'cpuset/cpuset.cpus' file.
+func (c *Cgroup) NumCPU() (int, error) {
+	raw, err := getValue(c.makePath("cpuset"), "cpuset.cpus")
+	if err != nil {
+		return 0, err
+	}
+	list := strings.TrimSpace(raw)
+	return countCpuset(list)
+}
+
+// MemoryLimit returns the value of this cgroup's
+// 'memory/memory.limit_in_bytes' file parsed as an unsigned decimal number.
+func (c *Cgroup) MemoryLimit() (uint64, error) {
+	raw, err := getValue(c.makePath("memory"), "memory.limit_in_bytes")
+	if err != nil {
+		return 0, err
+	}
+	limit := strings.TrimSpace(raw)
+	return strconv.ParseUint(limit, 10, 64)
+}
+
+// makePath returns the directory of this cgroup under the given controller:
+// cgroupRoot/<controller>/<parent>/<Name> when a parent path is recorded for
+// the controller, cgroupRoot/<controller>/<Name> otherwise.
+func (c *Cgroup) makePath(controllerName string) string {
+	parent, found := c.Parents[controllerName]
+	if !found {
+		return filepath.Join(cgroupRoot, controllerName, c.Name)
+	}
+	return filepath.Join(cgroupRoot, controllerName, filepath.Join(parent, c.Name))
+}
+
+// controller applies the settings of an OCI resource spec to a single cgroup
+// controller.
+type controller interface {
+ // set writes the values relevant to this controller from the spec into
+ // the controller directory given as the second argument.
+ set(*specs.LinuxResources, string) error
+}
+
+// noop is the controller implementation used for cgroup controllers that
+// need no configuration.
+type noop struct{}
+
+// set implements controller.set. It ignores its arguments and always
+// succeeds.
+func (*noop) set(*specs.LinuxResources, string) error {
+ return nil
+}
+
+// memory configures the "memory" cgroup controller.
+type memory struct{}
+
+// set writes the memory limits found in spec.Memory to the controller
+// directory 'path'. Values that are unset (nil or zero) are skipped, and the
+// OOM killer is only touched when the spec asks to disable it. The first
+// failing write aborts the rest.
+func (*memory) set(spec *specs.LinuxResources, path string) error {
+	mem := spec.Memory
+	if mem == nil {
+		return nil
+	}
+
+	// Signed limits, written in this order.
+	limits := []struct {
+		file string
+		val  *int64
+	}{
+		{"memory.limit_in_bytes", mem.Limit},
+		{"memory.soft_limit_in_bytes", mem.Reservation},
+		{"memory.memsw.limit_in_bytes", mem.Swap},
+		{"memory.kmem.limit_in_bytes", mem.Kernel},
+		{"memory.kmem.tcp.limit_in_bytes", mem.KernelTCP},
+	}
+	for _, l := range limits {
+		if err := setOptionalValueInt(path, l.file, l.val); err != nil {
+			return err
+		}
+	}
+	if err := setOptionalValueUint(path, "memory.swappiness", mem.Swappiness); err != nil {
+		return err
+	}
+
+	if mem.DisableOOMKiller == nil || !*mem.DisableOOMKiller {
+		return nil
+	}
+	return setValue(path, "memory.oom_control", "1")
+}
+
+// cpu configures the "cpu" cgroup controller.
+type cpu struct{}
+
+// set writes shares, CFS quota and CFS period from spec.CPU to the
+// controller directory 'path', in that order, stopping at the first error.
+// Unset (nil or zero) values are skipped.
+func (*cpu) set(spec *specs.LinuxResources, path string) error {
+	limits := spec.CPU
+	if limits == nil {
+		return nil
+	}
+	err := setOptionalValueUint(path, "cpu.shares", limits.Shares)
+	if err == nil {
+		err = setOptionalValueInt(path, "cpu.cfs_quota_us", limits.Quota)
+	}
+	if err == nil {
+		err = setOptionalValueUint(path, "cpu.cfs_period_us", limits.Period)
+	}
+	return err
+}
+
+// cpuSet configures the "cpuset" cgroup controller.
+type cpuSet struct{}
+
+// set makes sure both cpuset.cpus and cpuset.mems have a value in the
+// controller directory 'path'. They are required fields that are not set on
+// a new cgroup, so each one is taken from the spec when given there and
+// inherited from an ancestor cgroup otherwise.
+func (*cpuSet) set(spec *specs.LinuxResources, path string) error {
+	var cpus, mems string
+	if spec.CPU != nil {
+		cpus, mems = spec.CPU.Cpus, spec.CPU.Mems
+	}
+
+	if cpus != "" {
+		if err := setValue(path, "cpuset.cpus", cpus); err != nil {
+			return err
+		}
+	} else if _, err := fillFromAncestor(filepath.Join(path, "cpuset.cpus")); err != nil {
+		return err
+	}
+
+	if mems != "" {
+		return setValue(path, "cpuset.mems", mems)
+	}
+	_, err := fillFromAncestor(filepath.Join(path, "cpuset.mems"))
+	return err
+}
+
+// blockIO configures the "blkio" cgroup controller.
+type blockIO struct{}
+
+// set writes the block I/O settings from spec.BlockIO to the controller
+// directory 'path': the group-wide weights, the per-device weights, and the
+// per-device read/write throttles. Unset values are skipped, and the first
+// failing write aborts the rest.
+func (*blockIO) set(spec *specs.LinuxResources, path string) error {
+	if spec.BlockIO == nil {
+		return nil
+	}
+
+	if err := setOptionalValueUint16(path, "blkio.weight", spec.BlockIO.Weight); err != nil {
+		return err
+	}
+	if err := setOptionalValueUint16(path, "blkio.leaf_weight", spec.BlockIO.LeafWeight); err != nil {
+		return err
+	}
+
+	for _, dev := range spec.BlockIO.WeightDevice {
+		// Weight and LeafWeight are optional pointers. Dereference them so
+		// the value rather than the address gets written, and skip the ones
+		// that are not set.
+		if dev.Weight != nil {
+			val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.Weight)
+			if err := setValue(path, "blkio.weight_device", val); err != nil {
+				return err
+			}
+		}
+		if dev.LeafWeight != nil {
+			val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.LeafWeight)
+			if err := setValue(path, "blkio.leaf_weight_device", val); err != nil {
+				return err
+			}
+		}
+	}
+	if err := setThrottle(path, "blkio.throttle.read_bps_device", spec.BlockIO.ThrottleReadBpsDevice); err != nil {
+		return err
+	}
+	if err := setThrottle(path, "blkio.throttle.write_bps_device", spec.BlockIO.ThrottleWriteBpsDevice); err != nil {
+		return err
+	}
+	if err := setThrottle(path, "blkio.throttle.read_iops_device", spec.BlockIO.ThrottleReadIOPSDevice); err != nil {
+		return err
+	}
+	return setThrottle(path, "blkio.throttle.write_iops_device", spec.BlockIO.ThrottleWriteIOPSDevice)
+}
+
+// setThrottle writes one "<major>:<minor> <rate>" entry per device in 'devs'
+// to the file 'name' under 'path', stopping at the first failed write.
+func setThrottle(path, name string, devs []specs.LinuxThrottleDevice) error {
+	for i := range devs {
+		d := &devs[i]
+		entry := fmt.Sprintf("%d:%d %d", d.Major, d.Minor, d.Rate)
+		if err := setValue(path, name, entry); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// networkClass configures the "net_cls" cgroup controller.
+type networkClass struct{}
+
+// set writes spec.Network.ClassID to net_cls.classid under 'path'. Nothing
+// is written when the spec has no network section or the class ID is unset.
+func (*networkClass) set(spec *specs.LinuxResources, path string) error {
+	if network := spec.Network; network != nil {
+		return setOptionalValueUint32(path, "net_cls.classid", network.ClassID)
+	}
+	return nil
+}
+
+// networkPrio configures the "net_prio" cgroup controller.
+type networkPrio struct{}
+
+// set writes one "<interface> <priority>" entry to net_prio.ifpriomap under
+// 'path' for every priority listed in spec.Network, stopping at the first
+// failed write.
+func (*networkPrio) set(spec *specs.LinuxResources, path string) error {
+	network := spec.Network
+	if network == nil {
+		return nil
+	}
+	for i := range network.Priorities {
+		p := &network.Priorities[i]
+		entry := fmt.Sprintf("%s %d", p.Name, p.Priority)
+		if err := setValue(path, "net_prio.ifpriomap", entry); err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
new file mode 100644
index 000000000..3a547d4aa
--- /dev/null
+++ b/runsc/cmd/boot.go
@@ -0,0 +1,257 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "os"
+ "runtime/debug"
+ "strings"
+ "syscall"
+
+ "flag"
+ "github.com/google/subcommands"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+// Boot implements subcommands.Command for the "boot" command which starts a
+// new sandbox. It should not be called directly.
+type Boot struct {
+ // bundleDir is the directory containing the OCI spec.
+ bundleDir string
+
+ // specFD is the file descriptor that the spec will be read from.
+ specFD int
+
+ // controllerFD is the file descriptor of a stream socket for the
+ // control server that is donated to this process.
+ controllerFD int
+
+ // deviceFD is the file descriptor for the platform device file.
+ deviceFD int
+
+ // ioFDs is the list of FDs used to connect to FS gofers.
+ ioFDs intFlags
+
+ // stdioFDs are the fds for stdin, stdout, and stderr. They must be
+ // provided in that order.
+ stdioFDs intFlags
+
+ // console is set to true if the sandbox should allow terminal ioctl(2)
+ // syscalls.
+ console bool
+
+ // applyCaps determines if capabilities defined in the spec should be applied
+ // to the process.
+ applyCaps bool
+
+ // setUpRoot is set to true if the sandbox is started in an empty root.
+ setUpRoot bool
+
+ // cpuNum is the number of CPUs to create inside the sandbox.
+ cpuNum int
+
+ // totalMem sets the initial amount of total memory to report back to the
+ // container.
+ totalMem uint64
+
+ // userLogFD is the file descriptor to write user logs to.
+ userLogFD int
+
+ // startSyncFD is the file descriptor to synchronize runsc and sandbox.
+ startSyncFD int
+
+ // mountsFD is the file descriptor to read list of mounts after they have
+ // been resolved (direct paths, no symlinks). They are resolved outside the
+ // sandbox (e.g. gofer) and sent through this FD.
+ mountsFD int
+
+ // pidns is set if the sandbox is in its own pid namespace.
+ pidns bool
+}
+
+// Name implements subcommands.Command.Name. The command is named "boot".
+func (*Boot) Name() string {
+ return "boot"
+}
+
+// Synopsis implements subcommands.Command.Synopsis. It returns a one-line
+// description of the command.
+func (*Boot) Synopsis() string {
+ return "launch a sandbox process (internal use only)"
+}
+
+// Usage implements subcommands.Command.Usage. The command takes flags
+// followed by a single container ID.
+func (*Boot) Usage() string {
+ return `boot [flags] <container id>`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. It binds every field of
+// Boot to a command line flag. The FD flags that Execute treats as required
+// (spec-fd, controller-fd, start-sync-fd) default to -1 so that a missing
+// flag can be detected.
+func (b *Boot) SetFlags(f *flag.FlagSet) {
+ f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory")
+ f.IntVar(&b.specFD, "spec-fd", -1, "required fd with the container spec")
+ f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process")
+ f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file")
+ f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec")
+ f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order")
+ f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls")
+ f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process")
+ f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process")
+ f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace")
+ f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox")
+ f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container")
+ f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.")
+ f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD to used to synchronize sandbox startup")
+ f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to read list of mounts after they have been resolved (direct paths, no symlinks).")
+}
+
+// Execute implements subcommands.Command.Execute. It starts a sandbox in a
+// waiting state.
+//
+// The process may re-execute itself up to two times before the sandbox is
+// actually created: once after setting up the chroot (when --setup-root is
+// given without --apply-caps) and once after applying capabilities (when
+// --apply-caps is given). Each re-exec drops the flag that triggered it.
+//
+// args[0] must be the *boot.Config and args[1] the *syscall.WaitStatus that
+// receives the exit status of the application. The container ID is the single
+// positional argument.
+func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ // spec-fd, controller-fd and start-sync-fd are mandatory, as is exactly one
+ // positional argument.
+ if b.specFD == -1 || b.controllerFD == -1 || b.startSyncFD == -1 || f.NArg() != 1 {
+ f.Usage()
+ return subcommands.ExitUsageError
+ }
+
+ // Ensure that if there is a panic, all goroutine stacks are printed.
+ debug.SetTraceback("all")
+
+ if b.setUpRoot {
+ if err := setUpChroot(b.pidns); err != nil {
+ Fatalf("error setting up chroot: %v", err)
+ }
+
+ if !b.applyCaps {
+ // Remove --setup-root arg to call myself.
+ var args []string
+ for _, arg := range os.Args {
+ if !strings.Contains(arg, "setup-root") {
+ args = append(args, arg)
+ }
+ }
+ // Note that we've already read the spec from the spec FD, and
+ // we will read it again after the exec call. This works
+ // because the ReadSpecFromFile function seeks to the beginning
+ // of the file before reading.
+ if err := callSelfAsNobody(args); err != nil {
+ Fatalf("%v", err)
+ }
+ panic("callSelfAsNobody must never return success")
+ }
+ }
+
+ // Get the spec from the specFD.
+ specFile := os.NewFile(uintptr(b.specFD), "spec file")
+ defer specFile.Close()
+ spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile)
+ if err != nil {
+ Fatalf("reading spec: %v", err)
+ }
+ specutils.LogSpec(spec)
+
+ conf := args[0].(*boot.Config)
+ waitStatus := args[1].(*syscall.WaitStatus)
+
+ if b.applyCaps {
+ caps := spec.Process.Capabilities
+ if caps == nil {
+ caps = &specs.LinuxCapabilities{}
+ }
+ if conf.Platform == boot.PlatformPtrace {
+ // Ptrace platform requires extra capabilities.
+ const c = "CAP_SYS_PTRACE"
+ caps.Bounding = append(caps.Bounding, c)
+ caps.Effective = append(caps.Effective, c)
+ caps.Permitted = append(caps.Permitted, c)
+ }
+
+ // Remove --apply-caps arg to call myself. --setup-root is dropped as
+ // well, since the chroot (if requested) has already been set up above.
+ var args []string
+ for _, arg := range os.Args {
+ if !strings.Contains(arg, "setup-root") && !strings.Contains(arg, "apply-caps") {
+ args = append(args, arg)
+ }
+ }
+
+ // Note that we've already read the spec from the spec FD, and
+ // we will read it again after the exec call. This works
+ // because the ReadSpecFromFile function seeks to the beginning
+ // of the file before reading.
+ if err := setCapsAndCallSelf(args, caps); err != nil {
+ Fatalf("%v", err)
+ }
+ panic("setCapsAndCallSelf must never return success")
+ }
+
+ // Read resolved mount list and replace the original one from the spec.
+ // The file is closed explicitly on both paths rather than with defer.
+ mountsFile := os.NewFile(uintptr(b.mountsFD), "mounts file")
+ cleanMounts, err := specutils.ReadMounts(mountsFile)
+ if err != nil {
+ mountsFile.Close()
+ Fatalf("Error reading mounts file: %v", err)
+ }
+ mountsFile.Close()
+ spec.Mounts = cleanMounts
+
+ // Create the loader.
+ bootArgs := boot.Args{
+ ID: f.Arg(0),
+ Spec: spec,
+ Conf: conf,
+ ControllerFD: b.controllerFD,
+ Device: os.NewFile(uintptr(b.deviceFD), "platform device"),
+ GoferFDs: b.ioFDs.GetArray(),
+ StdioFDs: b.stdioFDs.GetArray(),
+ Console: b.console,
+ NumCPU: b.cpuNum,
+ TotalMem: b.totalMem,
+ UserLogFD: b.userLogFD,
+ }
+ l, err := boot.New(bootArgs)
+ if err != nil {
+ Fatalf("creating loader: %v", err)
+ }
+
+ // Fatalf exits the process and doesn't run defers.
+ // 'l' must be destroyed explicitly after this point!
+
+ // Notify the parent process the sandbox has booted (and that the controller
+ // is up). A single byte is written to the start-sync descriptor.
+ startSyncFile := os.NewFile(uintptr(b.startSyncFD), "start-sync file")
+ buf := make([]byte, 1)
+ if w, err := startSyncFile.Write(buf); err != nil || w != 1 {
+ l.Destroy()
+ Fatalf("unable to write into the start-sync descriptor: %v", err)
+ }
+ // Closes startSyncFile because 'l.Run()' only returns when the sandbox exits.
+ startSyncFile.Close()
+
+ // Wait for the start signal from runsc.
+ l.WaitForStartSignal()
+
+ // Run the application and wait for it to finish.
+ if err := l.Run(); err != nil {
+ l.Destroy()
+ Fatalf("running sandbox: %v", err)
+ }
+
+ // Hand the application's exit status back to the caller through waitStatus.
+ ws := l.WaitExit()
+ log.Infof("application exiting with %+v", ws)
+ *waitStatus = syscall.WaitStatus(ws.Status())
+ l.Destroy()
+ return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/capability.go b/runsc/cmd/capability.go
new file mode 100644
index 000000000..312e5b471
--- /dev/null
+++ b/runsc/cmd/capability.go
@@ -0,0 +1,157 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "fmt"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "github.com/syndtr/gocapability/capability"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+)
+
+// allCapTypes lists every capability set that applyCaps populates from the
+// spec.
+var allCapTypes = []capability.CapType{
+ capability.BOUNDS,
+ capability.EFFECTIVE,
+ capability.PERMITTED,
+ capability.INHERITABLE,
+ capability.AMBIENT,
+}
+
+// applyCaps replaces the capability sets of the current thread with the ones
+// listed in the spec, after dropping every capability the thread is not
+// currently permitted to hold.
+//
+// Note that it must be called with current thread locked.
+func applyCaps(caps *specs.LinuxCapabilities) error {
+	// The capabilities held right now decide what can be kept.
+	current, err := capability.NewPid2(0)
+	if err != nil {
+		return err
+	}
+	if err := current.Load(); err != nil {
+		return err
+	}
+
+	// A second, never loaded set starts out empty and is filled from the spec.
+	target, err := capability.NewPid2(0)
+	if err != nil {
+		return err
+	}
+
+	for _, capType := range allCapTypes {
+		if !target.Empty(capType) {
+			panic("unloaded capabilities must be empty")
+		}
+		allowed, err := trimCaps(getCaps(capType, caps), current)
+		if err != nil {
+			return err
+		}
+		target.Set(capType, allowed...)
+	}
+
+	const scope = capability.CAPS | capability.BOUNDS | capability.AMBS
+	if err := target.Apply(scope); err != nil {
+		return err
+	}
+	log.Infof("Capabilities applied: %+v", target)
+	return nil
+}
+
+// getCaps returns the capability names the spec lists for the given set.
+// It panics when 'which' is not one of the five known capability sets.
+func getCaps(which capability.CapType, caps *specs.LinuxCapabilities) []string {
+	var names []string
+	switch which {
+	case capability.BOUNDS:
+		names = caps.Bounding
+	case capability.EFFECTIVE:
+		names = caps.Effective
+	case capability.PERMITTED:
+		names = caps.Permitted
+	case capability.INHERITABLE:
+		names = caps.Inheritable
+	case capability.AMBIENT:
+		names = caps.Ambient
+	default:
+		panic(fmt.Sprint("invalid capability type:", which))
+	}
+	return names
+}
+
+// trimCaps converts capability names to capability.Cap values and keeps only
+// those present in the PERMITTED set of 'setter'. Dropped capabilities are
+// logged as warnings. An unknown name is an error.
+func trimCaps(names []string, setter capability.Capabilities) ([]capability.Cap, error) {
+	requested, err := capsFromNames(names)
+	if err != nil {
+		return nil, err
+	}
+
+	// Capability rules are more complicated than this, but this catches most
+	// problems with tests running with non-privileged user.
+	var kept []capability.Cap
+	for _, c := range requested {
+		if !setter.Get(capability.PERMITTED, c) {
+			log.Warningf("Capability %q is not permitted, dropping it.", c)
+			continue
+		}
+		kept = append(kept, c)
+	}
+	return kept, nil
+}
+
+// capsFromNames looks up every name in capFromName and returns the matching
+// capability.Cap values in the same order. The first unknown name aborts the
+// conversion with an error.
+func capsFromNames(names []string) ([]capability.Cap, error) {
+	var resolved []capability.Cap
+	for i := range names {
+		c, known := capFromName[names[i]]
+		if !known {
+			return nil, fmt.Errorf("invalid capability %q", names[i])
+		}
+		resolved = append(resolved, c)
+	}
+	return resolved, nil
+}
+
+// capFromName maps the capability names accepted in the spec (e.g.
+// "CAP_CHOWN") to their capability.Cap values. Names missing from this map
+// are rejected by capsFromNames.
+var capFromName = map[string]capability.Cap{
+ "CAP_CHOWN": capability.CAP_CHOWN,
+ "CAP_DAC_OVERRIDE": capability.CAP_DAC_OVERRIDE,
+ "CAP_DAC_READ_SEARCH": capability.CAP_DAC_READ_SEARCH,
+ "CAP_FOWNER": capability.CAP_FOWNER,
+ "CAP_FSETID": capability.CAP_FSETID,
+ "CAP_KILL": capability.CAP_KILL,
+ "CAP_SETGID": capability.CAP_SETGID,
+ "CAP_SETUID": capability.CAP_SETUID,
+ "CAP_SETPCAP": capability.CAP_SETPCAP,
+ "CAP_LINUX_IMMUTABLE": capability.CAP_LINUX_IMMUTABLE,
+ "CAP_NET_BIND_SERVICE": capability.CAP_NET_BIND_SERVICE,
+ "CAP_NET_BROADCAST": capability.CAP_NET_BROADCAST,
+ "CAP_NET_ADMIN": capability.CAP_NET_ADMIN,
+ "CAP_NET_RAW": capability.CAP_NET_RAW,
+ "CAP_IPC_LOCK": capability.CAP_IPC_LOCK,
+ "CAP_IPC_OWNER": capability.CAP_IPC_OWNER,
+ "CAP_SYS_MODULE": capability.CAP_SYS_MODULE,
+ "CAP_SYS_RAWIO": capability.CAP_SYS_RAWIO,
+ "CAP_SYS_CHROOT": capability.CAP_SYS_CHROOT,
+ "CAP_SYS_PTRACE": capability.CAP_SYS_PTRACE,
+ "CAP_SYS_PACCT": capability.CAP_SYS_PACCT,
+ "CAP_SYS_ADMIN": capability.CAP_SYS_ADMIN,
+ "CAP_SYS_BOOT": capability.CAP_SYS_BOOT,
+ "CAP_SYS_NICE": capability.CAP_SYS_NICE,
+ "CAP_SYS_RESOURCE": capability.CAP_SYS_RESOURCE,
+ "CAP_SYS_TIME": capability.CAP_SYS_TIME,
+ "CAP_SYS_TTY_CONFIG": capability.CAP_SYS_TTY_CONFIG,
+ "CAP_MKNOD": capability.CAP_MKNOD,
+ "CAP_LEASE": capability.CAP_LEASE,
+ "CAP_AUDIT_WRITE": capability.CAP_AUDIT_WRITE,
+ "CAP_AUDIT_CONTROL": capability.CAP_AUDIT_CONTROL,
+ "CAP_SETFCAP": capability.CAP_SETFCAP,
+ "CAP_MAC_OVERRIDE": capability.CAP_MAC_OVERRIDE,
+ "CAP_MAC_ADMIN": capability.CAP_MAC_ADMIN,
+ "CAP_SYSLOG": capability.CAP_SYSLOG,
+ "CAP_WAKE_ALARM": capability.CAP_WAKE_ALARM,
+ "CAP_BLOCK_SUSPEND": capability.CAP_BLOCK_SUSPEND,
+ "CAP_AUDIT_READ": capability.CAP_AUDIT_READ,
+}
diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go
new file mode 100644
index 000000000..96d3c3378
--- /dev/null
+++ b/runsc/cmd/checkpoint.go
@@ -0,0 +1,150 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "os"
+ "path/filepath"
+ "syscall"
+
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/container"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+// checkpointFileName is the name of the file, inside the directory given by
+// the image-path flag, that holds the container's saved image/state.
+const checkpointFileName = "checkpoint.img"
+
+// Checkpoint implements subcommands.Command for the "checkpoint" command.
+type Checkpoint struct {
+ // imagePath is the directory the checkpoint image is written to. It is
+ // required.
+ imagePath string
+ // leaveRunning, when set, restores the container from the image right
+ // after the checkpoint is taken.
+ leaveRunning bool
+}
+
+// Name implements subcommands.Command.Name. The command is named
+// "checkpoint".
+func (*Checkpoint) Name() string {
+ return "checkpoint"
+}
+
+// Synopsis implements subcommands.Command.Synopsis. It returns a one-line
+// description of the command.
+func (*Checkpoint) Synopsis() string {
+ return "checkpoint current state of container (experimental)"
+}
+
+// Usage implements subcommands.Command.Usage. The command takes flags
+// followed by a single container ID.
+func (*Checkpoint) Usage() string {
+ return `checkpoint [flags] <container id> - save current state of container.
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. Besides the flags backed
+// by Checkpoint fields it registers "work-path", which is accepted but never
+// read.
+func (c *Checkpoint) SetFlags(f *flag.FlagSet) {
+ f.StringVar(&c.imagePath, "image-path", "", "directory path to saved container image")
+ f.BoolVar(&c.leaveRunning, "leave-running", false, "restart the container after checkpointing")
+
+ // Unimplemented flags necessary for compatibility with docker.
+ var wp string
+ f.StringVar(&wp, "work-path", "", "ignored")
+}
+
+// Execute implements subcommands.Command.Execute. It saves the state of the
+// container named by the single positional argument into
+// <image-path>/checkpoint.img; the image file must not exist yet.
+//
+// With --leave-running the container is then destroyed, recreated under the
+// same ID, restored from the image and waited on. Its exit status is stored
+// in the *syscall.WaitStatus passed as args[1]. args[0] must be the
+// *boot.Config.
+func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	id := f.Arg(0)
+	conf := args[0].(*boot.Config)
+	waitStatus := args[1].(*syscall.WaitStatus)
+
+	cont, err := container.Load(conf.RootDir, id)
+	if err != nil {
+		Fatalf("loading container: %v", err)
+	}
+
+	if c.imagePath == "" {
+		Fatalf("image-path flag must be provided")
+	}
+
+	if err := os.MkdirAll(c.imagePath, 0755); err != nil {
+		Fatalf("making directories at path provided: %v", err)
+	}
+
+	fullImagePath := filepath.Join(c.imagePath, checkpointFileName)
+
+	// Create the image file and open for writing. O_EXCL makes this fail
+	// rather than overwrite an existing image.
+	file, err := os.OpenFile(fullImagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644)
+	if err != nil {
+		Fatalf("os.OpenFile(%q) failed: %v", fullImagePath, err)
+	}
+	defer file.Close()
+
+	if err := cont.Checkpoint(file); err != nil {
+		Fatalf("checkpoint failed: %v", err)
+	}
+
+	if !c.leaveRunning {
+		return subcommands.ExitSuccess
+	}
+
+	// TODO(b/110843694): Make it possible to restore into same container.
+	// For now, we can fake it by destroying the container and making a
+	// new container with the same ID. This hack does not work with docker
+	// which uses the container pid to ensure that the restore-container is
+	// actually the same as the checkpoint-container. By restoring into
+	// the same container, we will solve the docker incompatibility.
+
+	// Restore into new container with same ID.
+	bundleDir := cont.BundleDir
+	if bundleDir == "" {
+		Fatalf("setting bundleDir")
+	}
+
+	spec, err := specutils.ReadSpec(bundleDir)
+	if err != nil {
+		Fatalf("reading spec: %v", err)
+	}
+
+	specutils.LogSpec(spec)
+
+	if cont.ConsoleSocket != "" {
+		log.Warningf("ignoring console socket since it cannot be restored")
+	}
+
+	if err := cont.Destroy(); err != nil {
+		Fatalf("destroying container: %v", err)
+	}
+
+	cont, err = container.Create(id, spec, conf, bundleDir, "", "", "")
+	if err != nil {
+		Fatalf("restoring container: %v", err)
+	}
+	defer cont.Destroy()
+
+	if err := cont.Restore(spec, conf, fullImagePath); err != nil {
+		Fatalf("starting container: %v", err)
+	}
+
+	// A failed wait leaves 'ws' meaningless, so it must not be reported as a
+	// successful exit status.
+	ws, err := cont.Wait()
+	if err != nil {
+		Fatalf("waiting for container: %v", err)
+	}
+	*waitStatus = ws
+
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/chroot.go b/runsc/cmd/chroot.go
new file mode 100644
index 000000000..1a774db04
--- /dev/null
+++ b/runsc/cmd/chroot.go
@@ -0,0 +1,97 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "fmt"
+ "os"
+ "path/filepath"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+ // mountInChroot mounts src with filesystem type typ and the given mount flags
+ // at dst, where dst is interpreted relative to the chroot directory. The
+ // actual mount (and, per the original description, creation of the mount
+ // point) is delegated to specutils.Mount. A failure is returned wrapped with
+ // the source and the resolved target path.
+ func mountInChroot(chroot, src, dst, typ string, flags uint32) error {
+ 	target := filepath.Join(chroot, dst)
+ 	log.Infof("Mounting %q at %q", src, target)
+
+ 	err := specutils.Mount(src, target, typ, flags)
+ 	if err == nil {
+ 		return nil
+ 	}
+ 	return fmt.Errorf("error mounting %q at %q: %v", src, target, err)
+ }
+
+ // pivotRoot changes into root, makes it the root filesystem of the calling
+ // process and then lazily detaches the previous root. The three steps must
+ // run in this order; the first failing step is reported and the rest are
+ // skipped.
+ func pivotRoot(root string) error {
+ 	err := os.Chdir(root)
+ 	if err != nil {
+ 		return fmt.Errorf("error changing working directory: %v", err)
+ 	}
+
+ 	// pivot_root(new_root, put_old) moves the current root filesystem
+ 	// (old_root) of the calling process to put_old and installs new_root as
+ 	// the new root filesystem.
+ 	//
+ 	// With pivot_root(".", ".") the mount of the working directory becomes
+ 	// the new root and is moved to "/"; the old_root is then moved to "/" as
+ 	// well, stacked on top of it. Since the parent mount of old_root is
+ 	// new_root, unmounting old_root afterwards leaves only new_root visible
+ 	// at "/".
+ 	err = syscall.PivotRoot(".", ".")
+ 	if err != nil {
+ 		return fmt.Errorf("error changing root filesystem: %v", err)
+ 	}
+
+ 	err = syscall.Unmount(".", syscall.MNT_DETACH)
+ 	if err != nil {
+ 		return fmt.Errorf("error umounting the old root file system: %v", err)
+ 	}
+ 	return nil
+ }
+
+ // setUpChroot builds a minimal root filesystem for the sandbox and pivots
+ // into it. A tmpfs is mounted over the temporary directory, proc is made
+ // available at /proc inside it, the tmpfs is remounted read-only and finally
+ // the process root is switched to it via pivotRoot.
+ //
+ // When pidns is true a new proc filesystem is mounted at /proc; otherwise the
+ // existing /proc is recursively bind-mounted read-only.
+ //
+ // The first failing step is returned as an error; nothing is rolled back.
+ func setUpChroot(pidns bool) error {
+ 	// We are a new mount namespace, so we can use /tmp as a directory to
+ 	// construct a new root.
+ 	chroot := os.TempDir()
+
+ 	log.Infof("Setting up sandbox chroot in %q", chroot)
+
+ 	// Convert all shared mounts into slave to be sure that nothing will be
+ 	// propagated outside of our namespace.
+ 	if err := syscall.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil {
+ 		return fmt.Errorf("error converting mounts: %v", err)
+ 	}
+
+ 	if err := syscall.Mount("runsc-root", chroot, "tmpfs", syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_NOEXEC, ""); err != nil {
+ 		return fmt.Errorf("error mounting tmpfs in chroot: %v", err)
+ 	}
+
+ 	if pidns {
+ 		flags := uint32(syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_NOEXEC | syscall.MS_RDONLY)
+ 		if err := mountInChroot(chroot, "proc", "/proc", "proc", flags); err != nil {
+ 			return fmt.Errorf("error mounting proc in chroot: %v", err)
+ 		}
+ 	} else {
+ 		if err := mountInChroot(chroot, "/proc", "/proc", "bind", syscall.MS_BIND|syscall.MS_RDONLY|syscall.MS_REC); err != nil {
+ 			return fmt.Errorf("error mounting proc in chroot: %v", err)
+ 		}
+ 	}
+
+ 	// The tmpfs had to be writable while the mount points were set up; lock
+ 	// it down before it becomes the root.
+ 	if err := syscall.Mount("", chroot, "", syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_BIND, ""); err != nil {
+ 		return fmt.Errorf("error remounting chroot in read-only: %v", err)
+ 	}
+
+ 	return pivotRoot(chroot)
+ }
diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go
new file mode 100644
index 000000000..a2fc377d1
--- /dev/null
+++ b/runsc/cmd/cmd.go
@@ -0,0 +1,117 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package cmd holds implementations of the runsc commands.
+package cmd
+
+import (
+ "fmt"
+ "os"
+ "runtime"
+ "strconv"
+ "syscall"
+
+ "github.com/google/subcommands"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+ // Errorf reports a formatted failure message both to the log (as a warning
+ // prefixed with "FATAL ERROR: ") and to stderr, and returns
+ // subcommands.ExitFailure so that callers can write `return Errorf(...)`.
+ func Errorf(s string, args ...interface{}) subcommands.ExitStatus {
+ 	// When runsc is launched by docker or cri-o, stderr may not be
+ 	// reachable, so the message is also emitted as a serious-looking
+ 	// warning in the log.
+ 	logFormat := "FATAL ERROR: " + s
+ 	stderrFormat := s + "\n"
+
+ 	log.Warningf(logFormat, args...)
+ 	fmt.Fprintf(os.Stderr, stderrFormat, args...)
+
+ 	return subcommands.ExitFailure
+ }
+
+// Fatalf logs to stderr and exits with a failure status code.
+func Fatalf(s string, args ...interface{}) {
+ Errorf(s, args...)
+ os.Exit(128)
+}
+
+ // intFlags can be used with int flags that appear multiple times. Every
+ // occurrence of the flag appends one value, in command-line order.
+ type intFlags []int
+
+ // String implements flag.Value. It renders the collected values in Go's
+ // default slice format, e.g. "[3 4]".
+ func (i *intFlags) String() string {
+ 	return fmt.Sprintf("%v", *i)
+ }
+
+ // Get implements flag.Value. It returns the receiver itself (a *intFlags).
+ func (i *intFlags) Get() interface{} {
+ 	return i
+ }
+
+ // GetArray returns array of FDs.
+ func (i *intFlags) GetArray() []int {
+ 	return *i
+ }
+
+ // Set implements flag.Value. It parses s as a decimal integer and appends it
+ // to the list. Values that are not integers, or that are negative, are
+ // rejected with an error; zero is accepted.
+ func (i *intFlags) Set(s string) error {
+ 	fd, err := strconv.Atoi(s)
+ 	if err != nil {
+ 		return fmt.Errorf("invalid flag value: %v", err)
+ 	}
+ 	if fd < 0 {
+ 		// Zero is accepted, so the message must not claim that the value
+ 		// has to be greater than 0.
+ 		return fmt.Errorf("flag value must not be negative: %d", fd)
+ 	}
+ 	*i = append(*i, fd)
+ 	return nil
+ }
+
+ // setCapsAndCallSelf applies caps to the current thread and then re-executes
+ // the runsc binary (specutils.ExePath) with 'args' and an empty environment,
+ // so that the process restarts with the desired capabilities. On success the
+ // call does not return; any returned value is therefore always a non-nil
+ // error.
+ func setCapsAndCallSelf(args []string, caps *specs.LinuxCapabilities) error {
+ 	// The capability change and the exec must happen on the same OS thread,
+ 	// so pin the goroutine for the duration.
+ 	runtime.LockOSThread()
+ 	defer runtime.UnlockOSThread()
+
+ 	capErr := applyCaps(caps)
+ 	if capErr != nil {
+ 		return fmt.Errorf("applyCaps() failed: %v", capErr)
+ 	}
+
+ 	self := specutils.ExePath
+ 	log.Infof("Execve %q again, bye!", self)
+
+ 	// Exec only returns when it failed.
+ 	execErr := syscall.Exec(self, args, []string{})
+ 	return fmt.Errorf("error executing %s: %v", self, execErr)
+ }
+
+ // callSelfAsNobody sets GID and UID to nobody (65534) and then execve's
+ // itself again with 'args' and an empty environment. The group is changed
+ // first, then the user. On success the call does not return; any returned
+ // value is therefore always a non-nil error.
+ func callSelfAsNobody(args []string) error {
+ 	// Keep thread locked while user/group are changed.
+ 	runtime.LockOSThread()
+ 	defer runtime.UnlockOSThread()
+
+ 	const nobody = 65534
+
+ 	// The raw syscalls are issued directly on the locked thread. Each error
+ 	// message names the ID that actually failed to change.
+ 	if _, _, err := syscall.RawSyscall(syscall.SYS_SETGID, uintptr(nobody), 0, 0); err != 0 {
+ 		return fmt.Errorf("error setting gid: %v", err)
+ 	}
+ 	if _, _, err := syscall.RawSyscall(syscall.SYS_SETUID, uintptr(nobody), 0, 0); err != 0 {
+ 		return fmt.Errorf("error setting uid: %v", err)
+ 	}
+
+ 	binPath := specutils.ExePath
+
+ 	log.Infof("Execve %q again, bye!", binPath)
+ 	// Exec only returns when it failed.
+ 	err := syscall.Exec(binPath, args, []string{})
+ 	return fmt.Errorf("error executing %s: %v", binPath, err)
+ }
diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go
new file mode 100644
index 000000000..629c198fd
--- /dev/null
+++ b/runsc/cmd/create.go
@@ -0,0 +1,103 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/container"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+ // Create implements subcommands.Command for the "create" command.
+ type Create struct {
+ // bundleDir is the path to the bundle directory (defaults to the
+ // current working directory).
+ bundleDir string
+
+ // pidFile is the filename that the sandbox pid will be written to.
+ // This file should only be created once the container process inside
+ // the sandbox is ready to use.
+ pidFile string
+
+ // consoleSocket is the path to an AF_UNIX socket which will receive a
+ // file descriptor referencing the master end of the console's
+ // pseudoterminal. This is ignored unless spec.Process.Terminal is
+ // true.
+ consoleSocket string
+
+ // userLog is the path to send user-visible logs to. This log is different
+ // from debug logs. The former is meant to be consumed by the users and should
+ // contain only information that is relevant to the person running the
+ // container, e.g. unsupported syscalls, while the latter is more verbose and
+ // consumed by developers.
+ userLog string
+ }
+
+ // Name implements subcommands.Command.Name.
+ func (*Create) Name() string {
+ return "create"
+ }
+
+ // Synopsis implements subcommands.Command.Synopsis.
+ func (*Create) Synopsis() string {
+ return "create a secure container"
+ }
+
+ // Usage implements subcommands.Command.Usage.
+ func (*Create) Usage() string {
+ return `create [flags] <container id> - create a secure container
+ `
+ }
+
+ // SetFlags implements subcommands.Command.SetFlags. It registers the
+ // -bundle, -console-socket, -pid-file and -user-log flags, all of which
+ // default to the empty string.
+ func (c *Create) SetFlags(f *flag.FlagSet) {
+ f.StringVar(&c.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory")
+ f.StringVar(&c.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal")
+ f.StringVar(&c.pidFile, "pid-file", "", "filename that the container pid will be written to")
+ f.StringVar(&c.userLog, "user-log", "", "filename to send user-visible logs to. Empty means no logging.")
+ }
+
+ // Execute implements subcommands.Command.Execute. It expects exactly one
+ // positional argument (the container ID) and a *boot.Config as args[0],
+ // reads the OCI spec from the bundle directory and creates the container.
+ // Failures while reading the spec or creating the container terminate the
+ // process via Fatalf.
+ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ 	if f.NArg() != 1 {
+ 		f.Usage()
+ 		return subcommands.ExitUsageError
+ 	}
+
+ 	containerID := f.Arg(0)
+ 	conf := args[0].(*boot.Config)
+
+ 	// An unset -bundle flag means "use the current working directory".
+ 	bundle := c.bundleDir
+ 	if bundle == "" {
+ 		bundle = getwdOrDie()
+ 	}
+
+ 	spec, err := specutils.ReadSpec(bundle)
+ 	if err != nil {
+ 		Fatalf("reading spec: %v", err)
+ 	}
+ 	specutils.LogSpec(spec)
+
+ 	// Create the container. A new sandbox will be created for the
+ 	// container unless the metadata specifies that it should be run in an
+ 	// existing container.
+ 	_, err = container.Create(containerID, spec, conf, bundle, c.consoleSocket, c.pidFile, c.userLog)
+ 	if err != nil {
+ 		Fatalf("creating container: %v", err)
+ 	}
+ 	return subcommands.ExitSuccess
+ }
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
new file mode 100644
index 000000000..27eb51172
--- /dev/null
+++ b/runsc/cmd/debug.go
@@ -0,0 +1,185 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "os"
+ "syscall"
+ "time"
+
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/container"
+)
+
+ // Debug implements subcommands.Command for the "debug" command.
+ type Debug struct {
+ 	// pid is the sandbox process ID. When non-zero it is used to find the
+ 	// sandbox and no container ID argument is needed.
+ 	pid int
+ 	// stacks, if true, dumps all sandbox stacks to the log.
+ 	stacks bool
+ 	// signal is the signal number to send to the sandbox; -1 (the default)
+ 	// sends nothing.
+ 	signal int
+ 	// profileHeap is the file to write a heap profile to, if non-empty.
+ 	profileHeap string
+ 	// profileCPU is the file to write a CPU profile to, if non-empty.
+ 	profileCPU string
+ 	// profileDelay is how long to wait before stopping the CPU profile.
+ 	profileDelay int
+ 	// trace is the file to write an execution trace to, if non-empty.
+ 	trace string
+ }
+
+ // Name implements subcommands.Command.
+ func (*Debug) Name() string {
+ 	return "debug"
+ }
+
+ // Synopsis implements subcommands.Command.
+ func (*Debug) Synopsis() string {
+ 	return "shows a variety of debug information"
+ }
+
+ // Usage implements subcommands.Command.
+ func (*Debug) Usage() string {
+ 	return `debug [flags] <container id>`
+ }
+
+ // SetFlags implements subcommands.Command. It registers one flag per Debug
+ // field.
+ func (d *Debug) SetFlags(f *flag.FlagSet) {
+ 	f.IntVar(&d.pid, "pid", 0, "sandbox process ID. Container ID is not necessary if this is set")
+ 	f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log")
+ 	f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.")
+ 	f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.")
+ 	f.IntVar(&d.profileDelay, "profile-delay", 5, "amount of time to wait before stopping CPU profile")
+ 	f.StringVar(&d.trace, "trace", "", "writes an execution trace to the given file.")
+ 	f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox")
+ }
+
+ // Execute implements subcommands.Command.Execute.
+ //
+ // The target container is found either by the container ID given as the
+ // single positional argument, or, when -pid is set, by scanning all
+ // containers under conf.RootDir for one whose sandbox PID matches. The
+ // sandbox must be running. The requested actions are then performed in a
+ // fixed order: send signal, dump stacks, heap profile, start CPU profile,
+ // start trace. If a CPU profile or trace was started, the command sleeps for
+ // -profile-delay seconds before the deferred stop functions run.
+ //
+ // Any failure terminates the process via Fatalf. Because Fatalf exits
+ // immediately, deferred stop/close functions do not run in that case.
+ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ 	var c *container.Container
+ 	conf := args[0].(*boot.Config)
+
+ 	if d.pid == 0 {
+ 		// No pid, container ID must have been provided.
+ 		if f.NArg() != 1 {
+ 			f.Usage()
+ 			return subcommands.ExitUsageError
+ 		}
+ 		var err error
+ 		c, err = container.Load(conf.RootDir, f.Arg(0))
+ 		if err != nil {
+ 			Fatalf("loading container %q: %v", f.Arg(0), err)
+ 		}
+ 	} else {
+ 		if f.NArg() != 0 {
+ 			f.Usage()
+ 			return subcommands.ExitUsageError
+ 		}
+ 		// Go over all sandboxes and find the one that matches PID.
+ 		ids, err := container.List(conf.RootDir)
+ 		if err != nil {
+ 			Fatalf("listing containers: %v", err)
+ 		}
+ 		for _, id := range ids {
+ 			candidate, err := container.Load(conf.RootDir, id)
+ 			if err != nil {
+ 				Fatalf("loading container %q: %v", id, err)
+ 			}
+ 			if candidate.SandboxPid() == d.pid {
+ 				c = candidate
+ 				break
+ 			}
+ 		}
+ 		if c == nil {
+ 			Fatalf("container with PID %d not found", d.pid)
+ 		}
+ 	}
+
+ 	if c.Sandbox == nil || !c.Sandbox.IsRunning() {
+ 		Fatalf("container sandbox is not running")
+ 	}
+ 	log.Infof("Found sandbox %q, PID: %d", c.Sandbox.ID, c.Sandbox.Pid)
+
+ 	if d.signal > 0 {
+ 		log.Infof("Sending signal %d to process: %d", d.signal, c.Sandbox.Pid)
+ 		if err := syscall.Kill(c.Sandbox.Pid, syscall.Signal(d.signal)); err != nil {
+ 			Fatalf("failed to send signal %d to process %d: %v", d.signal, c.Sandbox.Pid, err)
+ 		}
+ 	}
+ 	if d.stacks {
+ 		log.Infof("Retrieving sandbox stacks")
+ 		stacks, err := c.Sandbox.Stacks()
+ 		if err != nil {
+ 			Fatalf("retrieving stacks: %v", err)
+ 		}
+ 		log.Infof(" *** Stack dump ***\n%s", stacks)
+ 	}
+ 	if d.profileHeap != "" {
+ 		heapFile, err := os.Create(d.profileHeap)
+ 		if err != nil {
+ 			// Errors are passed as arguments, never as the format string:
+ 			// an error text containing '%' would otherwise be mangled.
+ 			Fatalf("%v", err)
+ 		}
+ 		defer heapFile.Close()
+
+ 		if err := c.Sandbox.HeapProfile(heapFile); err != nil {
+ 			Fatalf("%v", err)
+ 		}
+ 		log.Infof("Heap profile written to %q", d.profileHeap)
+ 	}
+
+ 	// delay records whether something was started that needs time to
+ 	// collect data before it is stopped.
+ 	delay := false
+ 	if d.profileCPU != "" {
+ 		delay = true
+ 		cpuFile, err := os.Create(d.profileCPU)
+ 		if err != nil {
+ 			Fatalf("%v", err)
+ 		}
+ 		defer func() {
+ 			cpuFile.Close()
+ 			if err := c.Sandbox.StopCPUProfile(); err != nil {
+ 				Fatalf("%v", err)
+ 			}
+ 			log.Infof("CPU profile written to %q", d.profileCPU)
+ 		}()
+ 		if err := c.Sandbox.StartCPUProfile(cpuFile); err != nil {
+ 			Fatalf("%v", err)
+ 		}
+ 		log.Infof("CPU profile started for %d sec, writing to %q", d.profileDelay, d.profileCPU)
+ 	}
+ 	if d.trace != "" {
+ 		delay = true
+ 		traceFile, err := os.Create(d.trace)
+ 		if err != nil {
+ 			Fatalf("%v", err)
+ 		}
+ 		defer func() {
+ 			traceFile.Close()
+ 			if err := c.Sandbox.StopTrace(); err != nil {
+ 				Fatalf("%v", err)
+ 			}
+ 			log.Infof("Trace written to %q", d.trace)
+ 		}()
+ 		if err := c.Sandbox.StartTrace(traceFile); err != nil {
+ 			Fatalf("%v", err)
+ 		}
+ 		log.Infof("Tracing started for %d sec, writing to %q", d.profileDelay, d.trace)
+ 	}
+
+ 	if delay {
+ 		time.Sleep(time.Duration(d.profileDelay) * time.Second)
+ 	}
+
+ 	return subcommands.ExitSuccess
+ }
diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go
new file mode 100644
index 000000000..9039723e9
--- /dev/null
+++ b/runsc/cmd/delete.go
@@ -0,0 +1,87 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "fmt"
+ "os"
+
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/container"
+)
+
+ // Delete implements subcommands.Command for the "delete" command.
+ type Delete struct {
+ // force indicates that the container should be terminated if running.
+ force bool
+ }
+
+ // Name implements subcommands.Command.Name.
+ func (*Delete) Name() string {
+ return "delete"
+ }
+
+ // Synopsis implements subcommands.Command.Synopsis.
+ func (*Delete) Synopsis() string {
+ return "delete resources held by a container"
+ }
+
+ // Usage implements subcommands.Command.Usage. One or more container IDs may
+ // be given.
+ func (*Delete) Usage() string {
+ return `delete [flags] <container ids>`
+ }
+
+ // SetFlags implements subcommands.Command.SetFlags. It registers the -force
+ // flag, which is off by default.
+ func (d *Delete) SetFlags(f *flag.FlagSet) {
+ f.BoolVar(&d.force, "force", false, "terminate container if running")
+ }
+
+ // Execute implements subcommands.Command.Execute. It requires at least one
+ // container ID as positional argument and a *boot.Config as args[0], and
+ // hands the IDs to execute. An error from execute terminates the process via
+ // Fatalf.
+ func (d *Delete) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ 	ids := f.Args()
+ 	if len(ids) == 0 {
+ 		f.Usage()
+ 		return subcommands.ExitUsageError
+ 	}
+
+ 	conf := args[0].(*boot.Config)
+ 	err := d.execute(ids, conf)
+ 	if err != nil {
+ 		Fatalf("%v", err)
+ 	}
+ 	return subcommands.ExitSuccess
+ }
+
+ // execute destroys every container listed in ids, loading each one from
+ // conf.RootDir.
+ //
+ // Without --force only containers in the Created or Stopped state may be
+ // deleted. With --force a container that cannot be found is logged and
+ // skipped, and processing continues with the remaining IDs.
+ //
+ // The first error stops processing and is returned; containers destroyed
+ // before that point stay destroyed.
+ func (d *Delete) execute(ids []string, conf *boot.Config) error {
+ 	for _, id := range ids {
+ 		c, err := container.Load(conf.RootDir, id)
+ 		if err != nil {
+ 			if os.IsNotExist(err) && d.force {
+ 				log.Warningf("couldn't find container %q: %v", id, err)
+ 				// Only this ID is missing; the others must still be
+ 				// deleted.
+ 				continue
+ 			}
+ 			return fmt.Errorf("loading container %q: %v", id, err)
+ 		}
+ 		if !d.force && c.Status != container.Created && c.Status != container.Stopped {
+ 			return fmt.Errorf("cannot delete container that is not stopped without --force flag")
+ 		}
+ 		if err := c.Destroy(); err != nil {
+ 			return fmt.Errorf("destroying container: %v", err)
+ 		}
+ 	}
+ 	return nil
+ }
diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go
new file mode 100644
index 000000000..8ea59046c
--- /dev/null
+++ b/runsc/cmd/do.go
@@ -0,0 +1,310 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "math/rand"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "strconv"
+ "strings"
+ "syscall"
+
+ "flag"
+ "github.com/google/subcommands"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/container"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+ // Do implements subcommands.Command for the "do" command. It sets up a simple
+ // sandbox and executes the command inside it. See Usage() for more details.
+ type Do struct {
+ // root is the path to the root directory (flag -root, default "/").
+ root string
+ // cwd is the path to the current directory (flag -cwd, default ".").
+ cwd string
+ // ip is the IPv4 address for the sandbox (flag -ip).
+ ip string
+ // networkNamespace selects whether to run in a new network namespace
+ // (flag -netns, default true).
+ networkNamespace bool
+ }
+
+ // Name implements subcommands.Command.Name.
+ func (*Do) Name() string {
+ return "do"
+ }
+
+ // Synopsis implements subcommands.Command.Synopsis.
+ func (*Do) Synopsis() string {
+ return "Simplistic way to execute a command inside the sandbox. It's to be used for testing only."
+ }
+
+ // Usage implements subcommands.Command.Usage.
+ func (*Do) Usage() string {
+ return `do [flags] <cmd> - runs a command.
+
+ This command starts a sandbox with host filesystem mounted inside as readonly,
+ with a writable tmpfs overlay on top of it. The given command is executed inside
+ the sandbox. It's to be used to quickly test applications without having to
+ install or run docker. It doesn't give nearly as many options and it's to be
+ used for testing only.
+ `
+ }
+
+ // SetFlags implements subcommands.Command.SetFlags. It registers the -root,
+ // -cwd, -ip and -netns flags.
+ func (c *Do) SetFlags(f *flag.FlagSet) {
+ f.StringVar(&c.root, "root", "/", `path to the root directory, defaults to "/"`)
+ f.StringVar(&c.cwd, "cwd", ".", "path to the current directory, defaults to the current directory")
+ f.StringVar(&c.ip, "ip", "192.168.10.2", "IPv4 address for the sandbox")
+ f.BoolVar(&c.networkNamespace, "netns", true, "run in a new network namespace")
+ }
+
+ // Execute implements subcommands.Command.Execute.
+ //
+ // It expects the command to run as positional arguments, a *boot.Config as
+ // args[0] and a *syscall.WaitStatus as args[1]. It builds an OCI spec that
+ // maps the resolved root and working directory with the host's environment
+ // and hostname, optionally sets up networking, writes the spec as
+ // config.json into a fresh temporary directory (which also becomes
+ // conf.RootDir) and runs the container. The container's wait status is
+ // stored in *args[1].
+ //
+ // conf is modified: Overlay is forced to true and RootDir is replaced.
+ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ 	if len(f.Args()) == 0 {
+ 		// c.Usage() merely returns the usage text; f.Usage() is what
+ 		// actually prints it.
+ 		f.Usage()
+ 		return subcommands.ExitUsageError
+ 	}
+
+ 	conf := args[0].(*boot.Config)
+ 	waitStatus := args[1].(*syscall.WaitStatus)
+
+ 	// Map the entire host file system, but make it readonly with a writable
+ 	// overlay on top (ignore --overlay option).
+ 	conf.Overlay = true
+
+ 	hostname, err := os.Hostname()
+ 	if err != nil {
+ 		return Errorf("Error to retrieve hostname: %v", err)
+ 	}
+
+ 	absRoot, err := resolvePath(c.root)
+ 	if err != nil {
+ 		return Errorf("Error resolving root: %v", err)
+ 	}
+ 	absCwd, err := resolvePath(c.cwd)
+ 	if err != nil {
+ 		return Errorf("Error resolving current directory: %v", err)
+ 	}
+
+ 	spec := &specs.Spec{
+ 		Root: &specs.Root{
+ 			Path: absRoot,
+ 		},
+ 		Process: &specs.Process{
+ 			Cwd:          absCwd,
+ 			Args:         f.Args(),
+ 			Env:          os.Environ(),
+ 			Capabilities: specutils.AllCapabilities(),
+ 		},
+ 		Hostname: hostname,
+ 	}
+
+ 	specutils.LogSpec(spec)
+
+ 	cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000))
+ 	if !c.networkNamespace {
+ 		if conf.Network != boot.NetworkHost {
+ 			// No extra arguments: the message has no format verbs.
+ 			Fatalf("The current network namespace can be used only if --network=host is set")
+ 		}
+ 	} else if conf.Network != boot.NetworkNone {
+ 		clean, err := c.setupNet(cid, spec)
+ 		if err != nil {
+ 			return Errorf("Error setting up network: %v", err)
+ 		}
+ 		defer clean()
+ 	}
+
+ 	out, err := json.Marshal(spec)
+ 	if err != nil {
+ 		return Errorf("Error to marshal spec: %v", err)
+ 	}
+ 	tmpDir, err := ioutil.TempDir("", "runsc-do")
+ 	if err != nil {
+ 		return Errorf("Error to create tmp dir: %v", err)
+ 	}
+ 	defer os.RemoveAll(tmpDir)
+
+ 	log.Infof("Changing configuration RootDir to %q", tmpDir)
+ 	conf.RootDir = tmpDir
+
+ 	// tmpDir doubles as the bundle directory passed to container.Run.
+ 	cfgPath := filepath.Join(tmpDir, "config.json")
+ 	if err := ioutil.WriteFile(cfgPath, out, 0755); err != nil {
+ 		return Errorf("Error write spec: %v", err)
+ 	}
+
+ 	ws, err := container.Run(cid, spec, conf, tmpDir, "", "", "", false)
+ 	if err != nil {
+ 		return Errorf("running container: %v", err)
+ 	}
+
+ 	*waitStatus = ws
+ 	return subcommands.ExitSuccess
+ }
+
+func resolvePath(path string) (string, error) {
+ var err error
+ path, err = filepath.Abs(path)
+ if err != nil {
+ return "", fmt.Errorf("resolving %q: %v", path, err)
+ }
+ path = filepath.Clean(path)
+ if err := syscall.Access(path, 0); err != nil {
+ return "", fmt.Errorf("unable to access %q: %v", path, err)
+ }
+ return path, nil
+}
+
+ // setupNet prepares networking for the sandbox identified by cid: it creates
+ // a veth pair, a network namespace named cid with one end of the pair inside
+ // it, and iptables rules that forward and masquerade traffic through the
+ // host's default device. It also adds read-only bind mounts for
+ // /etc/resolv.conf, /etc/hostname and /etc/hosts to spec, and appends the new
+ // network namespace to spec.Linux.Namespaces.
+ //
+ // On success it returns a function that undoes the host-side setup via
+ // cleanNet. On failure nothing is rolled back.
+ func (c *Do) setupNet(cid string, spec *specs.Spec) (func(), error) {
+ 	hostDev, err := defaultDevice()
+ 	if err != nil {
+ 		return nil, err
+ 	}
+ 	peerIP, err := calculatePeerIP(c.ip)
+ 	if err != nil {
+ 		return nil, err
+ 	}
+ 	veth, peer := deviceNames(cid)
+
+ 	steps := []string{
+ 		fmt.Sprintf("ip link add %s type veth peer name %s", veth, peer),
+
+ 		// Setup device outside the namespace.
+ 		fmt.Sprintf("ip addr add %s/24 dev %s", peerIP, peer),
+ 		fmt.Sprintf("ip link set %s up", peer),
+
+ 		// Setup device inside the namespace.
+ 		fmt.Sprintf("ip netns add %s", cid),
+ 		fmt.Sprintf("ip link set %s netns %s", veth, cid),
+ 		fmt.Sprintf("ip netns exec %s ip addr add %s/24 dev %s", cid, c.ip, veth),
+ 		fmt.Sprintf("ip netns exec %s ip link set %s up", cid, veth),
+ 		fmt.Sprintf("ip netns exec %s ip link set lo up", cid),
+ 		fmt.Sprintf("ip netns exec %s ip route add default via %s", cid, peerIP),
+
+ 		// Enable network access.
+ 		"sysctl -w net.ipv4.ip_forward=1",
+ 		fmt.Sprintf("iptables -t nat -A POSTROUTING -s %s -o %s -j MASQUERADE", c.ip, hostDev),
+ 		fmt.Sprintf("iptables -A FORWARD -i %s -o %s -j ACCEPT", hostDev, peer),
+ 		fmt.Sprintf("iptables -A FORWARD -o %s -i %s -j ACCEPT", hostDev, peer),
+ 	}
+
+ 	// Each step is a plain space-separated command line, run in order; the
+ 	// first failure aborts the setup.
+ 	for _, step := range steps {
+ 		log.Debugf("Run %q", step)
+ 		argv := strings.Split(step, " ")
+ 		if err := exec.Command(argv[0], argv[1:]...).Run(); err != nil {
+ 			return nil, fmt.Errorf("failed to run %q: %v", step, err)
+ 		}
+ 	}
+
+ 	// Files that are bind-mounted into the sandbox.
+ 	files := []struct {
+ 		dest    string
+ 		content string
+ 	}{
+ 		{"/etc/resolv.conf", "nameserver 8.8.8.8\n"},
+ 		{"/etc/hostname", cid + "\n"},
+ 		{"/etc/hosts", fmt.Sprintf("127.0.0.1\tlocalhost\n%s\t%s\n", c.ip, cid)},
+ 	}
+ 	for _, file := range files {
+ 		if err := makeFile(file.dest, file.content, spec); err != nil {
+ 			return nil, err
+ 		}
+ 	}
+
+ 	if spec.Linux == nil {
+ 		spec.Linux = &specs.Linux{}
+ 	}
+ 	spec.Linux.Namespaces = append(spec.Linux.Namespaces, specs.LinuxNamespace{
+ 		Type: specs.NetworkNamespace,
+ 		Path: filepath.Join("/var/run/netns", cid),
+ 	})
+
+ 	return func() { c.cleanNet(cid, hostDev) }, nil
+ }
+
+ // cleanNet undoes the host-side network setup performed by setupNet for the
+ // sandbox cid: it deletes the veth pair, the network namespace and the
+ // iptables rules that reference host device dev. It is best effort: a
+ // failing command is logged as a warning and the remaining commands still
+ // run.
+ func (c *Do) cleanNet(cid, dev string) {
+ 	_, peer := deviceNames(cid)
+
+ 	// iptables -D only removes a rule whose specification matches the one
+ 	// that was added. setupNet adds the rules with "-s <ip>" (no prefix
+ 	// length) and with the peer device, so the same values are used here.
+ 	cmds := []string{
+ 		fmt.Sprintf("ip link delete %s", peer),
+ 		fmt.Sprintf("ip netns delete %s", cid),
+
+ 		fmt.Sprintf("iptables -t nat -D POSTROUTING -s %s -o %s -j MASQUERADE", c.ip, dev),
+ 		fmt.Sprintf("iptables -D FORWARD -i %s -o %s -j ACCEPT", dev, peer),
+ 		fmt.Sprintf("iptables -D FORWARD -o %s -i %s -j ACCEPT", dev, peer),
+ 	}
+
+ 	for _, cmd := range cmds {
+ 		log.Debugf("Run %q", cmd)
+ 		args := strings.Split(cmd, " ")
+ 		if err := exec.Command(args[0], args[1:]...).Run(); err != nil {
+ 			log.Warningf("Failed to run %q: %v", cmd, err)
+ 		}
+ 	}
+ }
+
+func deviceNames(cid string) (string, string) {
+ // Device name is limited to 15 letters.
+ return "ve-" + cid, "vp-" + cid
+
+}
+
+ // defaultDevice returns the name of the network device used by the host's
+ // default route, as reported by "ip route list default". It returns an error
+ // if the command fails or its output names no device.
+ func defaultDevice() (string, error) {
+ 	out, err := exec.Command("ip", "route", "list", "default").CombinedOutput()
+ 	if err != nil {
+ 		return "", err
+ 	}
+ 	return parseDefaultDevice(string(out))
+ }
+
+ // parseDefaultDevice extracts the device name from the output of
+ // "ip route list default": the token following the first "dev" keyword.
+ //
+ // Splitting on whitespace (rather than on single spaces) keeps a trailing
+ // newline out of the device name, and looking for "dev" instead of a fixed
+ // position also handles routes without a "via" part.
+ func parseDefaultDevice(out string) (string, error) {
+ 	for _, line := range strings.Split(out, "\n") {
+ 		fields := strings.Fields(line)
+ 		for i := 0; i+1 < len(fields); i++ {
+ 			if fields[i] == "dev" {
+ 				return fields[i+1], nil
+ 			}
+ 		}
+ 	}
+ 	return "", fmt.Errorf("malformed %q output: %q", "ip route list default", out)
+ }
+
+ // makeFile writes content to a new temporary file (named after the base name
+ // of dest) and appends a read-only bind mount of that file at dest to
+ // spec.Mounts.
+ //
+ // The temporary file is deliberately left on disk on success, because the
+ // mount refers to it by path. Its descriptor is closed before returning, and
+ // the file is removed again if writing or closing fails.
+ func makeFile(dest, content string, spec *specs.Spec) error {
+ 	tmpFile, err := ioutil.TempFile("", filepath.Base(dest))
+ 	if err != nil {
+ 		return err
+ 	}
+ 	if _, err := tmpFile.WriteString(content); err != nil {
+ 		// Best-effort cleanup; the write error is the one worth reporting.
+ 		tmpFile.Close()
+ 		os.Remove(tmpFile.Name())
+ 		return err
+ 	}
+ 	// Only the path is needed from here on, so do not keep the descriptor
+ 	// open.
+ 	if err := tmpFile.Close(); err != nil {
+ 		os.Remove(tmpFile.Name())
+ 		return err
+ 	}
+ 	spec.Mounts = append(spec.Mounts, specs.Mount{
+ 		Source:      tmpFile.Name(),
+ 		Destination: dest,
+ 		Type:        "bind",
+ 		Options:     []string{"ro"},
+ 	})
+ 	return nil
+ }
+
+ // calculatePeerIP returns the address to assign to the host side of the veth
+ // pair for a sandbox that uses IPv4 address ip. The peer is the next address
+ // in the same /24: the last octet is incremented, wrapping around to 1.
+ //
+ // The wrap happens before 255 is reached because both addresses are
+ // configured with a /24 prefix, in which .255 is the broadcast address.
+ //
+ // An error is returned if ip is not in dotted-quad form or if its last octet
+ // is not a number in the range 0-255. Only the last octet is validated.
+ func calculatePeerIP(ip string) (string, error) {
+ 	parts := strings.Split(ip, ".")
+ 	if len(parts) != 4 {
+ 		return "", fmt.Errorf("invalid IP format %q", ip)
+ 	}
+ 	n, err := strconv.Atoi(parts[3])
+ 	if err != nil {
+ 		return "", fmt.Errorf("invalid IP format %q: %v", ip, err)
+ 	}
+ 	if n < 0 || n > 255 {
+ 		return "", fmt.Errorf("invalid IP format %q: last octet %d out of range", ip, n)
+ 	}
+ 	n++
+ 	if n >= 255 {
+ 		n = 1
+ 	}
+ 	return fmt.Sprintf("%s.%s.%s.%d", parts[0], parts[1], parts[2], n), nil
+ }
diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go
new file mode 100644
index 000000000..c6bc8fc3a
--- /dev/null
+++ b/runsc/cmd/events.go
@@ -0,0 +1,111 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "encoding/json"
+ "os"
+ "time"
+
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/container"
+)
+
+ // Events implements subcommands.Command for the "events" command.
+ type Events struct {
+ // The interval between stats reporting, in seconds (flag -interval).
+ intervalSec int
+ // If true, events will print a single group of stats and exit.
+ stats bool
+ }
+
+ // Name implements subcommands.Command.Name.
+ func (*Events) Name() string {
+ return "events"
+ }
+
+ // Synopsis implements subcommands.Command.Synopsis.
+ func (*Events) Synopsis() string {
+ return "display container events such as OOM notifications, cpu, memory, and IO usage statistics"
+ }
+
+ // Usage implements subcommands.Command.Usage. The "every 5 seconds" in the
+ // text mirrors the default of the -interval flag registered in SetFlags.
+ func (*Events) Usage() string {
+ return `<container-id>
+
+ Where "<container-id>" is the name for the instance of the container.
+
+ The events command displays information about the container. By default the
+ information is displayed once every 5 seconds.
+
+ OPTIONS:
+ `
+ }
+
+ // SetFlags implements subcommands.Command.SetFlags. It registers -interval
+ // (default 5) and -stats (default false).
+ func (evs *Events) SetFlags(f *flag.FlagSet) {
+ f.IntVar(&evs.intervalSec, "interval", 5, "set the stats collection interval, in seconds")
+ f.BoolVar(&evs.stats, "stats", false, "display the container's stats then exit")
+ }
+
+ // Execute implements subcommands.Command.Execute.
+ //
+ // It expects exactly one positional argument (the container ID) and a
+ // *boot.Config as args[0]. The container is loaded and its event is fetched,
+ // marshalled to JSON and written to stdout, repeatedly, sleeping -interval
+ // seconds between rounds. Errors while fetching or marshalling are logged as
+ // warnings and the loop carries on.
+ //
+ // With -stats a single round is performed; the command then returns
+ // ExitFailure if fetching or marshalling the event failed, and ExitSuccess
+ // otherwise. Failure to load the container terminates the process via
+ // Fatalf.
+ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ 	if f.NArg() != 1 {
+ 		f.Usage()
+ 		return subcommands.ExitUsageError
+ 	}
+
+ 	id := f.Arg(0)
+ 	conf := args[0].(*boot.Config)
+
+ 	c, err := container.Load(conf.RootDir, id)
+ 	if err != nil {
+ 		Fatalf("loading sandbox: %v", err)
+ 	}
+
+ 	// Repeatedly get stats from the container.
+ 	for {
+ 		// Get the event and print it as JSON.
+ 		//
+ 		// The two errors are kept in separate variables: both are needed
+ 		// below to decide the exit status in -stats mode, and reusing one
+ 		// variable would let the marshalling result hide an event error.
+ 		ev, evErr := c.Event()
+ 		if evErr != nil {
+ 			log.Warningf("Error getting events for container: %v", evErr)
+ 		}
+ 		b, marshalErr := json.Marshal(ev)
+ 		if marshalErr != nil {
+ 			log.Warningf("Error while marshalling event %v: %v", ev, marshalErr)
+ 		} else {
+ 			os.Stdout.Write(b)
+ 		}
+
+ 		// If we're only running once, break. If we're only running
+ 		// once and there was an error, the command failed.
+ 		if evs.stats {
+ 			if evErr != nil || marshalErr != nil {
+ 				return subcommands.ExitFailure
+ 			}
+ 			break
+ 		}
+
+ 		time.Sleep(time.Duration(evs.intervalSec) * time.Second)
+ 	}
+
+ 	return subcommands.ExitSuccess
+ }
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
new file mode 100644
index 000000000..52fd7ac4b
--- /dev/null
+++ b/runsc/cmd/exec.go
@@ -0,0 +1,486 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "strconv"
+ "strings"
+ "syscall"
+ "time"
+
+ "flag"
+ "github.com/google/subcommands"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/control"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/urpc"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/console"
+ "gvisor.googlesource.com/gvisor/runsc/container"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+// privateClearStatusFlag is the name of the private "exec" flag bound to
+// Exec.clearStatus. A detaching parent passes it, set to false, to the child
+// it re-executes in place of the "detach" flag.
+const privateClearStatusFlag = "private-clear-status"
+
+// Exec implements subcommands.Command for the "exec" command.
+//
+// All fields are populated from command line flags registered in SetFlags.
+type Exec struct {
+ // cwd is the "cwd" flag: the working directory for the new process. When
+ // it is empty, Execute falls back to the Cwd of the container's spec.
+ cwd string
+ // env collects the values of every "env" flag occurrence.
+ env stringSlice
+ // user contains the UID and GID with which to run the new process.
+ user user
+ // extraKGIDs collects the values of every "additional-gids" flag
+ // occurrence; they are parsed into GIDs in argsFromCLI.
+ extraKGIDs stringSlice
+ // caps collects the values of every "cap" flag occurrence.
+ caps stringSlice
+ // detach is the "detach" flag. When set, Execute hands over to
+ // execAndWait instead of running and waiting for the process itself.
+ detach bool
+ // clearStatus is the private-clear-status flag (default true); Execute
+ // forwards it to the container's WaitPID call.
+ clearStatus bool
+ // processPath is the "process" flag: path to a process.json file that
+ // replaces the command given on the command line.
+ processPath string
+ // pidFile is the "pid-file" flag; Execute writes this process's own PID
+ // to it.
+ pidFile string
+ // internalPidFile is the "internal-pid-file" flag; Execute writes the PID
+ // returned by the container's Execute call to it.
+ internalPidFile string
+
+ // consoleSocket is the path to an AF_UNIX socket which will receive a
+ // file descriptor referencing the master end of the console's
+ // pseudoterminal.
+ consoleSocket string
+}
+
+// Name implements subcommands.Command.Name.
+func (*Exec) Name() string {
+ return "exec"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Exec) Synopsis() string {
+ return "execute new process inside the container"
+}
+
+// Usage implements subcommands.Command.Usage. It returns the help text for
+// the "exec" command: the two accepted invocation forms (command given on the
+// command line, or taken from a process.json via the "process" flag) and an
+// example.
+func (*Exec) Usage() string {
+	return `exec [command options] <container-id> <command> [command options] || --process process.json <container-id>
+
+
+Where "<container-id>" is the name for the instance of the container and
+"<command>" is the command to be executed in the container.
+"<command>" can't be empty unless a "-process" flag is provided.
+
+EXAMPLE:
+If the container is configured to run /bin/ps the following will
+output a list of processes running in the container:
+
+ # runsc exec <container-id> ps
+
+OPTIONS:
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. It binds every field of
+// Exec to its command line flag.
+func (ex *Exec) SetFlags(f *flag.FlagSet) {
+	// Plain string flags.
+	f.StringVar(&ex.cwd, "cwd", "", "current working directory")
+	f.StringVar(&ex.processPath, "process", "", "path to the process.json")
+	f.StringVar(&ex.pidFile, "pid-file", "", "filename that the container pid will be written to")
+	f.StringVar(&ex.internalPidFile, "internal-pid-file", "", "filename that the container-internal pid will be written to")
+	f.StringVar(&ex.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal")
+
+	// Flags backed by custom flag.Value implementations.
+	f.Var(&ex.env, "env", "set environment variables (e.g. '-env PATH=/bin -env TERM=xterm')")
+	f.Var(&ex.user, "user", "UID (format: <uid>[:<gid>])")
+	f.Var(&ex.extraKGIDs, "additional-gids", "additional gids")
+	f.Var(&ex.caps, "cap", "add a capability to the bounding set for the process")
+
+	// Boolean flags. The private one clears the status of the exec'd process
+	// upon completion; it is only used when we fork due to --detach being
+	// set on the parent.
+	f.BoolVar(&ex.detach, "detach", false, "detach from the container's process")
+	f.BoolVar(&ex.clearStatus, privateClearStatusFlag, true, "private flag, do not use")
+}
+
+// Execute implements subcommands.Command.Execute. It starts a process in an
+// already created container.
+//
+// args[0] must be a *boot.Config and args[1] a *syscall.WaitStatus that
+// receives the exit status of the executed process. Settings that were not
+// given explicitly (working directory, environment, capabilities) are taken
+// from the container's spec. With -detach the work is delegated to
+// execAndWait; otherwise the process is started, the requested pid files are
+// written and Execute blocks until the process exits.
+func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	e, id, err := ex.parseArgs(f)
+	if err != nil {
+		Fatalf("parsing process spec: %v", err)
+	}
+	conf := args[0].(*boot.Config)
+	waitStatus := args[1].(*syscall.WaitStatus)
+
+	c, err := container.Load(conf.RootDir, id)
+	if err != nil {
+		Fatalf("loading sandbox: %v", err)
+	}
+
+	// Replace empty settings with defaults from container.
+	if e.WorkingDirectory == "" {
+		e.WorkingDirectory = c.Spec.Process.Cwd
+	}
+	if e.Envv == nil {
+		e.Envv, err = resolveEnvs(c.Spec.Process.Env, ex.env)
+		if err != nil {
+			Fatalf("getting environment variables: %v", err)
+		}
+	}
+	if e.Capabilities == nil {
+		// enableRaw is set to true to prevent the filtering out of
+		// CAP_NET_RAW. This is the opposite of Create() because exec
+		// requires the capability to be set explicitly, while 'docker
+		// run' sets it by default.
+		e.Capabilities, err = specutils.Capabilities(true /* enableRaw */, c.Spec.Process.Capabilities)
+		if err != nil {
+			Fatalf("creating capabilities: %v", err)
+		}
+	}
+
+	// containerd expects an actual process to represent the container being
+	// executed. If detach was specified, starts a child in non-detach mode,
+	// write the child's PID to the pid file. So when the container returns, the
+	// child process will also return and signal containerd.
+	if ex.detach {
+		return ex.execAndWait(waitStatus)
+	}
+
+	// Start the new process and get its pid.
+	pid, err := c.Execute(e)
+	if err != nil {
+		Fatalf("executing process in container: %v", err)
+	}
+
+	if e.StdioIsPty {
+		// Forward signals sent to this process to the foreground
+		// process in the sandbox.
+		stopForwarding := c.ForwardSignals(pid, true /* fgProcess */)
+		defer stopForwarding()
+	}
+
+	// Write the sandbox-internal pid if required.
+	if ex.internalPidFile != "" {
+		pidStr := []byte(strconv.Itoa(int(pid)))
+		if err := ioutil.WriteFile(ex.internalPidFile, pidStr, 0644); err != nil {
+			Fatalf("writing internal pid file %q: %v", ex.internalPidFile, err)
+		}
+	}
+
+	// Generate the pid file after the internal pid file is generated, so that users
+	// can safely assume that the internal pid file is ready after `runsc exec -d`
+	// returns.
+	if ex.pidFile != "" {
+		if err := ioutil.WriteFile(ex.pidFile, []byte(strconv.Itoa(os.Getpid())), 0644); err != nil {
+			Fatalf("writing pid file: %v", err)
+		}
+	}
+
+	// Wait for the process to exit.
+	ws, err := c.WaitPID(pid, ex.clearStatus)
+	if err != nil {
+		Fatalf("waiting on pid %d: %v", pid, err)
+	}
+	*waitStatus = ws
+	return subcommands.ExitSuccess
+}
+
+// isDetachFlag reports whether arg is the "detach" flag in one of the forms
+// accepted by the flag package: -detach, --detach, -detach=<v>, --detach=<v>.
+func isDetachFlag(arg string) bool {
+	name := strings.TrimPrefix(arg, "-")
+	if name == arg {
+		// No leading dash: not a flag at all.
+		return false
+	}
+	name = strings.TrimPrefix(name, "-")
+	if i := strings.IndexByte(name, '='); i >= 0 {
+		name = name[:i]
+	}
+	return name == "detach"
+}
+
+// execAndWait implements "exec -detach". It re-executes the current command
+// line in a child process, with the "detach" flag replaced by
+// --private-clear-status=false, and returns once the child has written its
+// PID to the pid file. On success *waitStatus is set to 0.
+func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStatus {
+	binPath := specutils.ExePath
+
+	// The command needs to write a pid file so that execAndWait can tell
+	// when it has started. If no pid-file was provided, we should use a
+	// filename in a temp directory.
+	pidFile := ex.pidFile
+	var extraExecFlags []string
+	if pidFile == "" {
+		tmpDir, err := ioutil.TempDir("", "exec-pid-")
+		if err != nil {
+			Fatalf("creating TempDir: %v", err)
+		}
+		defer os.RemoveAll(tmpDir)
+		pidFile = filepath.Join(tmpDir, "pid")
+		extraExecFlags = append(extraExecFlags, "--pid-file="+pidFile)
+	}
+
+	// Add the rest of the args, excluding the "detach" flag. Only arguments
+	// that are exactly that flag are replaced, so paths or command arguments
+	// that merely contain the word "detach" are passed through untouched.
+	args := make([]string, 0, len(os.Args)+len(extraExecFlags))
+	for _, a := range os.Args[1:] {
+		if !isDetachFlag(a) {
+			args = append(args, a)
+			continue
+		}
+		// Replace with the "private-clear-status" flag, which tells
+		// the new process it's a detached child and shouldn't
+		// clear the exit status of the sentry process.
+		args = append(args, fmt.Sprintf("--%s=false", privateClearStatusFlag))
+		// The generated pid-file flag belongs to "exec", so it is inserted
+		// right where the "detach" flag was: among the exec flags, after
+		// the subcommand name. It is only inserted once.
+		args = append(args, extraExecFlags...)
+		extraExecFlags = nil
+	}
+
+	cmd := exec.Command(binPath, args...)
+	cmd.Args[0] = "runsc-exec"
+
+	// Exec stdio defaults to current process stdio.
+	cmd.Stdin = os.Stdin
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+
+	// If the console control socket file is provided, then create a new
+	// pty master/slave pair and set the TTY on the sandbox process.
+	if ex.consoleSocket != "" {
+		// Create a new TTY pair and send the master on the provided
+		// socket.
+		tty, err := console.NewWithSocket(ex.consoleSocket)
+		if err != nil {
+			Fatalf("setting up console with socket %q: %v", ex.consoleSocket, err)
+		}
+		defer tty.Close()
+
+		// Set stdio to the new TTY slave.
+		cmd.Stdin = tty
+		cmd.Stdout = tty
+		cmd.Stderr = tty
+		cmd.SysProcAttr = &syscall.SysProcAttr{
+			Setsid:  true,
+			Setctty: true,
+			Ctty:    int(tty.Fd()),
+		}
+	}
+
+	if err := cmd.Start(); err != nil {
+		Fatalf("failure to start child exec process, err: %v", err)
+	}
+
+	log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, binPath, args)
+
+	// Wait for PID file to ensure that child process has started. Otherwise,
+	// '--process' file is deleted as soon as this process returns and the child
+	// may fail to read it.
+	ready := func() (bool, error) {
+		pidb, err := ioutil.ReadFile(pidFile)
+		if err == nil {
+			// File appeared, check whether pid is fully written.
+			pid, err := strconv.Atoi(string(pidb))
+			if err != nil {
+				return false, nil
+			}
+			return pid == cmd.Process.Pid, nil
+		}
+		if pe, ok := err.(*os.PathError); !ok || pe.Err != syscall.ENOENT {
+			return false, err
+		}
+		// No file yet, continue to wait...
+		return false, nil
+	}
+	if err := specutils.WaitForReady(cmd.Process.Pid, 10*time.Second, ready); err != nil {
+		Fatalf("unexpected error waiting for PID file, err: %v", err)
+	}
+
+	*waitStatus = 0
+	return subcommands.ExitSuccess
+}
+
+// parseArgs builds the ExecArgs either from a process.json file (when the
+// --process flag was given) or from the command line. It returns the ExecArgs
+// together with the ID of the target container. When the number of positional
+// arguments does not fit the selected mode, the usage is printed and an error
+// is returned.
+func (ex *Exec) parseArgs(f *flag.FlagSet) (*control.ExecArgs, string, error) {
+	if ex.processPath != "" {
+		// Requires only the container ID.
+		if f.NArg() != 1 {
+			f.Usage()
+			return nil, "", fmt.Errorf("a container-id is required")
+		}
+		execArgs, err := ex.argsFromProcessFile()
+		return execArgs, f.Arg(0), err
+	}
+
+	// Requires at least a container ID and command.
+	if f.NArg() < 2 {
+		f.Usage()
+		return nil, "", fmt.Errorf("both a container-id and command are required")
+	}
+	positional := f.Args()
+	execArgs, err := ex.argsFromCLI(positional[1:])
+	return execArgs, f.Arg(0), err
+}
+
+// argsFromCLI builds the ExecArgs from the command line flags, using argv as
+// the command to execute. Envv is left unset so that the caller can fill it
+// in from the container's spec. An error is returned when an additional GID
+// is not a valid unsigned 32-bit number or when a capability cannot be
+// converted.
+func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) {
+	extraKGIDs := make([]auth.KGID, 0, len(ex.extraKGIDs))
+	for _, s := range ex.extraKGIDs {
+		// ParseUint rejects negative and out-of-range values, which would
+		// otherwise silently wrap around in the conversion below.
+		kgid, err := strconv.ParseUint(s, 10, 32)
+		if err != nil {
+			return nil, fmt.Errorf("parsing GID: %s, %v", s, err)
+		}
+		extraKGIDs = append(extraKGIDs, auth.KGID(kgid))
+	}
+
+	var caps *auth.TaskCapabilities
+	if len(ex.caps) > 0 {
+		var err error
+		caps, err = capabilities(ex.caps)
+		if err != nil {
+			return nil, fmt.Errorf("capabilities error: %v", err)
+		}
+	}
+
+	return &control.ExecArgs{
+		Argv:             argv,
+		WorkingDirectory: ex.cwd,
+		KUID:             ex.user.kuid,
+		KGID:             ex.user.kgid,
+		ExtraKGIDs:       extraKGIDs,
+		Capabilities:     caps,
+		StdioIsPty:       ex.consoleSocket != "",
+		FilePayload:      urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}},
+	}, nil
+}
+
+func (ex *Exec) argsFromProcessFile() (*control.ExecArgs, error) {
+ f, err := os.Open(ex.processPath)
+ if err != nil {
+ return nil, fmt.Errorf("error opening process file: %s, %v", ex.processPath, err)
+ }
+ defer f.Close()
+ var p specs.Process
+ if err := json.NewDecoder(f).Decode(&p); err != nil {
+ return nil, fmt.Errorf("error parsing process file: %s, %v", ex.processPath, err)
+ }
+ return argsFromProcess(&p)
+}
+
+// argsFromProcess converts a specs.Process into ExecArgs. It performs no IO
+// beyond referencing the current process's stdio files in the payload.
+func argsFromProcess(p *specs.Process) (*control.ExecArgs, error) {
+	// Convert the spec's additional GIDs to KGIDs.
+	gids := p.User.AdditionalGids
+	extra := make([]auth.KGID, len(gids))
+	for i, gid := range gids {
+		extra[i] = auth.KGID(gid)
+	}
+
+	// Create capabilities, when the spec has any.
+	var taskCaps *auth.TaskCapabilities
+	if p.Capabilities != nil {
+		// enableRaw is set to true to prevent the filtering out of
+		// CAP_NET_RAW. This is the opposite of Create() because exec
+		// requires the capability to be set explicitly, while 'docker
+		// run' sets it by default.
+		converted, err := specutils.Capabilities(true /* enableRaw */, p.Capabilities)
+		if err != nil {
+			return nil, fmt.Errorf("error creating capabilities: %v", err)
+		}
+		taskCaps = converted
+	}
+
+	stdio := []*os.File{os.Stdin, os.Stdout, os.Stderr}
+	execArgs := &control.ExecArgs{
+		Argv:             p.Args,
+		Envv:             p.Env,
+		WorkingDirectory: p.Cwd,
+		KUID:             auth.KUID(p.User.UID),
+		KGID:             auth.KGID(p.User.GID),
+		ExtraKGIDs:       extra,
+		Capabilities:     taskCaps,
+		StdioIsPty:       p.Terminal,
+		FilePayload:      urpc.FilePayload{Files: stdio},
+	}
+	return execArgs, nil
+}
+
+// resolveEnvs transforms lists of environment variables into a single list of
+// environment variables. If a variable is defined multiple times, the last
+// value is used.
+func resolveEnvs(envs ...[]string) ([]string, error) {
+ // First create a map of variable names to values. This removes any
+ // duplicates.
+ envMap := make(map[string]string)
+ for _, env := range envs {
+ for _, str := range env {
+ parts := strings.SplitN(str, "=", 2)
+ if len(parts) != 2 {
+ return nil, fmt.Errorf("invalid variable: %s", str)
+ }
+ envMap[parts[0]] = parts[1]
+ }
+ }
+ // Reassemble envMap into a list of environment variables of the form
+ // NAME=VALUE.
+ env := make([]string, 0, len(envMap))
+ for k, v := range envMap {
+ env = append(env, fmt.Sprintf("%s=%s", k, v))
+ }
+ return env, nil
+}
+
+// capabilities takes a list of capabilities as strings and returns an
+// auth.TaskCapabilities struct with those capabilities in every capability set.
+// This mimics runc's behavior.
+func capabilities(cs []string) (*auth.TaskCapabilities, error) {
+ var specCaps specs.LinuxCapabilities
+ for _, cap := range cs {
+ specCaps.Ambient = append(specCaps.Ambient, cap)
+ specCaps.Bounding = append(specCaps.Bounding, cap)
+ specCaps.Effective = append(specCaps.Effective, cap)
+ specCaps.Inheritable = append(specCaps.Inheritable, cap)
+ specCaps.Permitted = append(specCaps.Permitted, cap)
+ }
+ // enableRaw is set to true to prevent the filtering out of
+ // CAP_NET_RAW. This is the opposite of Create() because exec requires
+ // the capability to be set explicitly, while 'docker run' sets it by
+ // default.
+ return specutils.Capabilities(true /* enableRaw */, &specCaps)
+}
+
+// stringSlice is a flag.Value that may be given multiple times, every
+// occurrence appending one more value. For example, a flag called "x" could
+// be invoked via "runsc exec -x foo -x bar", and the corresponding
+// stringSlice would be {"foo", "bar"}.
+type stringSlice []string
+
+// String implements flag.Value.String. The values are rendered the way fmt
+// prints a slice of strings, e.g. "[foo bar]".
+func (ss *stringSlice) String() string {
+	return fmt.Sprint([]string(*ss))
+}
+
+// Get implements flag.Value.Get. It returns the receiver itself.
+func (ss *stringSlice) Get() interface{} {
+	return ss
+}
+
+// Set implements flag.Value.Set. It appends s and never fails.
+func (ss *stringSlice) Set(s string) error {
+	values := append(*ss, s)
+	*ss = values
+	return nil
+}
+
+// user allows -user to convey a UID and, optionally, a GID separated by a
+// colon.
+type user struct {
+	kuid auth.KUID
+	kgid auth.KGID
+}
+
+// String implements flag.Value.String.
+func (u *user) String() string {
+	return fmt.Sprintf("%+v", *u)
+}
+
+// Get implements flag.Value.Get. It returns the receiver itself.
+func (u *user) Get() interface{} {
+	return u
+}
+
+// Set implements flag.Value.Set. It parses s, which must have the form
+// "<uid>" or "<uid>:<gid>" with both IDs being unsigned 32-bit decimal
+// numbers. When no GID is given the current GID is left unchanged. On error
+// the receiver is not modified.
+func (u *user) Set(s string) error {
+	parts := strings.SplitN(s, ":", 2)
+
+	// ParseUint rejects negative and out-of-range values, which would
+	// otherwise silently wrap around when converted to an ID.
+	kuid, err := strconv.ParseUint(parts[0], 10, 32)
+	if err != nil {
+		return fmt.Errorf("couldn't parse UID: %s", parts[0])
+	}
+	if len(parts) == 1 {
+		u.kuid = auth.KUID(kuid)
+		return nil
+	}
+
+	kgid, err := strconv.ParseUint(parts[1], 10, 32)
+	if err != nil {
+		return fmt.Errorf("couldn't parse GID: %s", parts[1])
+	}
+	u.kuid = auth.KUID(kuid)
+	u.kgid = auth.KGID(kgid)
+	return nil
+}
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
new file mode 100644
index 000000000..bccb29397
--- /dev/null
+++ b/runsc/cmd/gofer.go
@@ -0,0 +1,446 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "os"
+ "path/filepath"
+ "strings"
+ "sync"
+ "syscall"
+
+ "flag"
+ "github.com/google/subcommands"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/p9"
+ "gvisor.googlesource.com/gvisor/pkg/unet"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/fsgofer"
+ "gvisor.googlesource.com/gvisor/runsc/fsgofer/filter"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+var (
+	// caps lists the names of the capabilities that make up goferCaps.
+	caps = []string{
+		"CAP_CHOWN",
+		"CAP_DAC_OVERRIDE",
+		"CAP_DAC_READ_SEARCH",
+		"CAP_FOWNER",
+		"CAP_FSETID",
+		"CAP_SYS_CHROOT",
+	}
+
+	// goferCaps is the minimal set of capabilities needed by the Gofer to
+	// operate on files. The bounding, effective and permitted sets all
+	// refer to the same list.
+	goferCaps = &specs.LinuxCapabilities{
+		Bounding:  caps,
+		Effective: caps,
+		Permitted: caps,
+	}
+)
+
+// Gofer implements subcommands.Command for the "gofer" command, which starts a
+// filesystem gofer. This command should not be called directly.
+//
+// All fields are populated from command line flags registered in SetFlags.
+type Gofer struct {
+ // bundleDir is the "bundle" flag: path to the root of the bundle
+ // directory. Execute rejects an empty value.
+ bundleDir string
+ // ioFDs is the "io-fds" flag: the FDs on which the 9P servers are
+ // served, root first, then one per 9P mount in spec order.
+ ioFDs intFlags
+ // applyCaps is the "apply-caps" flag. When set, Execute re-executes
+ // itself with goferCaps applied and with this flag turned off.
+ applyCaps bool
+ // setUpRoot is the "setup-root" flag. When set, Execute calls
+ // setupRootFS before doing anything else with the filesystem.
+ setUpRoot bool
+
+ // panicOnWrite is the "panic-on-write" flag; it is passed on to the
+ // fsgofer configuration of every attach point.
+ panicOnWrite bool
+ // specFD is the "spec-fd" flag: FD from which the container spec is
+ // read. Execute rejects negative values.
+ specFD int
+ // mountsFD is the "mounts-fd" flag: FD to which the resolved list of
+ // mounts is written.
+ mountsFD int
+}
+
+// Name implements subcommands.Command.
+func (*Gofer) Name() string {
+ return "gofer"
+}
+
+// Synopsis implements subcommands.Command.
+func (*Gofer) Synopsis() string {
+ return "launch a gofer process that serves files over 9P protocol (internal use only)"
+}
+
+// Usage implements subcommands.Command.
+func (*Gofer) Usage() string {
+ return `gofer [flags]`
+}
+
+// SetFlags implements subcommands.Command. It binds every field of Gofer to
+// its command line flag.
+func (g *Gofer) SetFlags(f *flag.FlagSet) {
+	// Locations: bundle directory and the file descriptors handed over by
+	// the parent process.
+	f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory")
+	f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec")
+	f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).")
+	f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec")
+
+	// Behavior switches.
+	f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do")
+	f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process")
+	f.BoolVar(&g.panicOnWrite, "panic-on-write", false, "if true, panics on attempts to write to RO mounts. RW mounts are unnaffected")
+}
+
+// Execute implements subcommands.Command.
+//
+// args[0] must be a *boot.Config. The flow is:
+//  1. read the container spec from --spec-fd;
+//  2. optionally set up the root filesystem (--setup-root);
+//  3. optionally re-execute itself with reduced capabilities (--apply-caps);
+//     in that case the code after the re-exec is never reached;
+//  4. resolve the mounts and send them over --mounts-fd;
+//  5. chroot into the served root, create one attach point for the root and
+//     one per 9P mount, install the seccomp filters and serve 9P on --io-fds
+//     until all servers exit.
+func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if g.bundleDir == "" || len(g.ioFDs) < 1 || g.specFD < 0 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	specFile := os.NewFile(uintptr(g.specFD), "spec file")
+	defer specFile.Close()
+	spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile)
+	if err != nil {
+		Fatalf("reading spec: %v", err)
+	}
+
+	conf := args[0].(*boot.Config)
+
+	if g.setUpRoot {
+		if err := setupRootFS(spec, conf); err != nil {
+			Fatalf("Error setting up root FS: %v", err)
+		}
+	}
+	if g.applyCaps {
+		// Disable caps when calling myself again.
+		// Note: minimal argument handling for the default case to keep it simple.
+		// os.Args is copied so that appending can never write into its
+		// backing array.
+		selfArgs := make([]string, 0, len(os.Args)+2)
+		selfArgs = append(selfArgs, os.Args...)
+		selfArgs = append(selfArgs, "--apply-caps=false", "--setup-root=false")
+		if err := setCapsAndCallSelf(selfArgs, goferCaps); err != nil {
+			Fatalf("Unable to apply caps: %v", err)
+		}
+		panic("unreachable")
+	}
+
+	// Find what path is going to be served by this gofer.
+	root := spec.Root.Path
+	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+		root = "/root"
+	}
+
+	// Resolve mount points paths, then replace mounts from our spec and send the
+	// mount list over to the sandbox, so they are both in sync.
+	//
+	// Note that all mount points have been mounted in the proper location in
+	// setupRootFS().
+	cleanMounts, err := resolveMounts(spec.Mounts, root)
+	if err != nil {
+		Fatalf("Failure to resolve mounts: %v", err)
+	}
+	spec.Mounts = cleanMounts
+	go func() {
+		if err := g.writeMounts(cleanMounts); err != nil {
+			panic(fmt.Sprintf("Failed to write mounts: %v", err))
+		}
+	}()
+
+	specutils.LogSpec(spec)
+
+	// fsgofer should run with a umask of 0, because we want to preserve file
+	// modes exactly as sent by the sandbox, which will have applied its own umask.
+	syscall.Umask(0)
+
+	if err := syscall.Chroot(root); err != nil {
+		Fatalf("failed to chroot to %q: %v", root, err)
+	}
+	if err := syscall.Chdir("/"); err != nil {
+		Fatalf("changing working dir: %v", err)
+	}
+	log.Infof("Process chroot'd to %q", root)
+
+	// Start with root mount, then add any other additional mount as needed.
+	ats := make([]p9.Attacher, 0, len(spec.Mounts)+1)
+	ap, err := fsgofer.NewAttachPoint("/", fsgofer.Config{
+		ROMount:      spec.Root.Readonly,
+		PanicOnWrite: g.panicOnWrite,
+	})
+	if err != nil {
+		Fatalf("creating attach point: %v", err)
+	}
+	ats = append(ats, ap)
+	log.Infof("Serving %q mapped to %q on FD %d (ro: %t)", "/", root, g.ioFDs[0], spec.Root.Readonly)
+
+	mountIdx := 1 // first one is the root
+	for _, m := range spec.Mounts {
+		if specutils.Is9PMount(m) {
+			cfg := fsgofer.Config{
+				ROMount:      isReadonlyMount(m.Options),
+				PanicOnWrite: g.panicOnWrite,
+			}
+			ap, err := fsgofer.NewAttachPoint(m.Destination, cfg)
+			if err != nil {
+				Fatalf("creating attach point: %v", err)
+			}
+			ats = append(ats, ap)
+
+			if mountIdx >= len(g.ioFDs) {
+				Fatalf("no FD found for mount. Did you forget --io-fds? mount: %d, %v", len(g.ioFDs), m)
+			}
+			log.Infof("Serving %q mapped on FD %d (ro: %t)", m.Destination, g.ioFDs[mountIdx], cfg.ROMount)
+			mountIdx++
+		}
+	}
+	if mountIdx != len(g.ioFDs) {
+		Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs))
+	}
+
+	if err := filter.Install(); err != nil {
+		Fatalf("installing seccomp filters: %v", err)
+	}
+
+	runServers(ats, g.ioFDs)
+	return subcommands.ExitSuccess
+}
+
+// runServers starts one 9P server per FD in ioFDs, the i-th FD being served
+// by the i-th attacher in ats, and blocks until every server has exited.
+func runServers(ats []p9.Attacher, ioFDs []int) {
+	var wg sync.WaitGroup
+	wg.Add(len(ioFDs))
+	for idx := range ioFDs {
+		go func(fd int, attacher p9.Attacher) {
+			defer wg.Done()
+
+			socket, err := unet.NewSocket(fd)
+			if err != nil {
+				Fatalf("creating server on FD %d: %v", fd, err)
+			}
+			server := p9.NewServer(attacher)
+			if err := server.Handle(socket); err != nil {
+				Fatalf("P9 server returned error. Gofer is shutting down. FD: %d, err: %v", fd, err)
+			}
+		}(ioFDs[idx], ats[idx])
+	}
+	// Run the loops and wait for all to exit.
+	wg.Wait()
+	log.Infof("All 9P servers exited.")
+}
+
+// writeMounts writes mounts, encoded as JSON, to the file descriptor given by
+// the --mounts-fd flag and closes the descriptor afterwards. It returns an
+// error when the mounts cannot be encoded, when the descriptor is not valid
+// or when the write fails.
+func (g *Gofer) writeMounts(mounts []specs.Mount) error {
+	data, err := json.Marshal(mounts)
+	if err != nil {
+		return err
+	}
+
+	// os.NewFile returns nil for an invalid descriptor, e.g. the flag's
+	// default value of -1.
+	f := os.NewFile(uintptr(g.mountsFD), "mounts file")
+	if f == nil {
+		return fmt.Errorf("invalid mounts FD: %d", g.mountsFD)
+	}
+	defer f.Close()
+
+	// File.Write returns a non-nil error whenever it writes less than
+	// len(data), so no retry loop is needed.
+	_, err = f.Write(data)
+	return err
+}
+
+// isReadonlyMount reports whether the mount options contain "ro".
+func isReadonlyMount(opts []string) bool {
+	for i := 0; i < len(opts); i++ {
+		switch opts[i] {
+		case "ro":
+			return true
+		}
+	}
+	return false
+}
+
+// setupRootFS prepares the filesystem tree that the gofer is going to serve.
+//
+// It makes all mounts slaves, bind mounts the spec's root, applies the root
+// propagation, bind mounts the spec's mounts inside the root, creates the
+// working directory and remounts the root read-only if the spec asks for it.
+// Unless conf.TestOnlyAllowRunAsCurrentUserWithoutChroot is set, all of this
+// happens inside a tmpfs mounted over /proc, to which the process finally
+// pivots; the spec's root then lives at /root.
+//
+// Every failure is reported through the returned error.
+func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
+	// Convert all shared mounts into slaves to be sure that nothing will be
+	// propagated outside of our namespace.
+	if err := syscall.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil {
+		return fmt.Errorf("error converting mounts: %v", err)
+	}
+
+	root := spec.Root.Path
+	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+		// FIXME: runsc can't be re-executed without
+		// /proc, so we create a tmpfs mount, mount ./proc and ./root
+		// there, then move this mount to the root and after
+		// setCapsAndCallSelf, runsc will chroot into /root.
+		//
+		// We need a directory to construct a new root and we know that
+		// runsc can't start without /proc, so we can use it for this.
+		flags := uintptr(syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_NOEXEC)
+		if err := syscall.Mount("runsc-root", "/proc", "tmpfs", flags, ""); err != nil {
+			return fmt.Errorf("error mounting tmpfs: %v", err)
+		}
+
+		// Prepare tree structure for pivot_root(2).
+		for _, dir := range []string{"/proc/proc", "/proc/root"} {
+			if err := os.Mkdir(dir, 0755); err != nil {
+				return fmt.Errorf("creating directory %q: %v", dir, err)
+			}
+		}
+		if err := syscall.Mount("runsc-proc", "/proc/proc", "proc", flags|syscall.MS_RDONLY, ""); err != nil {
+			return fmt.Errorf("error mounting proc: %v", err)
+		}
+		root = "/proc/root"
+	}
+
+	// Mount root path followed by submounts.
+	if err := syscall.Mount(spec.Root.Path, root, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
+		return fmt.Errorf("mounting root on root (%q) err: %v", root, err)
+	}
+
+	flags := uint32(syscall.MS_SLAVE | syscall.MS_REC)
+	if spec.Linux != nil && spec.Linux.RootfsPropagation != "" {
+		flags = specutils.PropOptionsToFlags([]string{spec.Linux.RootfsPropagation})
+	}
+	if err := syscall.Mount("", root, "", uintptr(flags), ""); err != nil {
+		return fmt.Errorf("mounting root (%q) with flags: %#x, err: %v", root, flags, err)
+	}
+
+	// Bind mount everything listed in the spec inside the new root.
+	if err := setupMounts(spec.Mounts, root); err != nil {
+		return fmt.Errorf("error setting up FS: %v", err)
+	}
+
+	// Create working directory if needed.
+	if spec.Process.Cwd != "" {
+		dst, err := resolveSymlinks(root, spec.Process.Cwd)
+		if err != nil {
+			return fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err)
+		}
+		if err := os.MkdirAll(dst, 0755); err != nil {
+			return fmt.Errorf("creating working directory %q: %v", spec.Process.Cwd, err)
+		}
+	}
+
+	// Check if root needs to be remounted as readonly.
+	if spec.Root.Readonly {
+		// If root is a mount point but not read-only, we can change mount options
+		// to make it read-only for extra safety.
+		log.Infof("Remounting root as readonly: %q", root)
+		flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC)
+		if err := syscall.Mount(root, root, "bind", flags, ""); err != nil {
+			return fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", root, root, flags, err)
+		}
+	}
+
+	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+		if err := pivotRoot("/proc"); err != nil {
+			return fmt.Errorf("failed to change the root file system: %v", err)
+		}
+		if err := os.Chdir("/"); err != nil {
+			return fmt.Errorf("failed to change working directory: %v", err)
+		}
+	}
+	return nil
+}
+
+// setupMounts bind mounts every supported "bind" mount of the spec at its
+// destination inside root. Destinations are resolved relative to root,
+// following symlinks as the container would see them. Mounts of any other
+// kind are skipped.
+func setupMounts(mounts []specs.Mount, root string) error {
+	for i := range mounts {
+		m := mounts[i]
+		if !(m.Type == "bind" && specutils.IsSupportedDevMount(m)) {
+			continue
+		}
+
+		target, err := resolveSymlinks(root, m.Destination)
+		if err != nil {
+			return fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err)
+		}
+
+		bindFlags := specutils.OptionsToFlags(m.Options) | syscall.MS_BIND
+		log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, target, bindFlags)
+		if err := specutils.Mount(m.Source, target, m.Type, bindFlags); err != nil {
+			return fmt.Errorf("mounting %v: %v", m, err)
+		}
+
+		// Set propagation options that cannot be set together with other options.
+		propFlags := specutils.PropOptionsToFlags(m.Options)
+		if propFlags == 0 {
+			continue
+		}
+		if err := syscall.Mount("", target, "", uintptr(propFlags), ""); err != nil {
+			return fmt.Errorf("mount dst: %q, flags: %#x, err: %v", target, propFlags, err)
+		}
+	}
+	return nil
+}
+
+// resolveMounts returns a copy of mounts in which the destination of every
+// supported "bind" mount has been resolved (relative paths and symlinks)
+// against root and rewritten as an absolute path inside the container. All
+// other mounts are copied unchanged.
+//
+// Note: mount points must already be in place for resolution to work.
+// Otherwise, it may follow symlinks to locations that would be overwritten
+// with another mount point and return the wrong location. In short, make sure
+// setupMounts() has been called before.
+func resolveMounts(mounts []specs.Mount, root string) ([]specs.Mount, error) {
+	resolved := make([]specs.Mount, 0, len(mounts))
+	for i := range mounts {
+		mnt := mounts[i]
+		if mnt.Type == "bind" && specutils.IsSupportedDevMount(mnt) {
+			dst, err := resolveSymlinks(root, mnt.Destination)
+			if err != nil {
+				return nil, fmt.Errorf("resolving symlinks to %q: %v", mnt.Destination, err)
+			}
+			relDst, err := filepath.Rel(root, dst)
+			if err != nil {
+				panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, root, err))
+			}
+			mnt.Destination = filepath.Join("/", relDst)
+		}
+		resolved = append(resolved, mnt)
+	}
+	return resolved, nil
+}
+
+// ResolveSymlinks walks 'rel' having 'root' as the root directory. If there are
+// symlinks, they are evaluated relative to 'root' to ensure the end result is
+// the same as if the process was running inside the container.
+func resolveSymlinks(root, rel string) (string, error) {
+ return resolveSymlinksImpl(root, root, rel, 255)
+}
+
+// resolveSymlinksImpl resolves 'rel' one path component at a time, starting
+// from 'base', and returns the resulting path.
+//
+//   - 'root' is the directory that acts as "/" for the resolution; the walk is
+//     never allowed to leave it.
+//   - 'base' is the already resolved directory the walk starts from.
+//   - 'followCount' is the number of nested resolution steps still allowed; an
+//     error is returned once it reaches zero.
+//
+// Symlinks are read and resolved recursively: absolute targets restart at
+// 'root', relative targets continue from the current base. Components that do
+// not exist are appended as they are, since there is no symlink to follow.
+func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, error) {
+	if followCount == 0 {
+		return "", fmt.Errorf("too many symlinks to follow, path: %q", filepath.Join(base, rel))
+	}
+
+	rel = filepath.Clean(rel)
+	for _, name := range strings.Split(rel, string(filepath.Separator)) {
+		if name == "" {
+			continue
+		}
+		// Note that Join() resolves things like ".." and returns a clean path.
+		path := filepath.Join(base, name)
+		if !strings.HasPrefix(path, root) {
+			// One cannot '..' their way out of root.
+			base = root
+			continue
+		}
+		fi, err := os.Lstat(path)
+		if err != nil {
+			if !os.IsNotExist(err) {
+				return "", err
+			}
+			// Not found means there is no symlink to check. Just keep walking dirs.
+			base = path
+			continue
+		}
+		if fi.Mode()&os.ModeSymlink != 0 {
+			link, err := os.Readlink(path)
+			if err != nil {
+				return "", err
+			}
+			if filepath.IsAbs(link) {
+				base = root
+			}
+			base, err = resolveSymlinksImpl(root, base, link, followCount-1)
+			if err != nil {
+				return "", err
+			}
+			continue
+		}
+		base = path
+	}
+	return base, nil
+}
diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go
new file mode 100644
index 000000000..aed5f3291
--- /dev/null
+++ b/runsc/cmd/kill.go
@@ -0,0 +1,154 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "fmt"
+ "strconv"
+ "strings"
+ "syscall"
+
+ "flag"
+ "github.com/google/subcommands"
+ "golang.org/x/sys/unix"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/container"
+)
+
+ // Kill is the subcommands.Command behind "runsc kill".
+ type Kill struct {
+ 	// all requests that the signal go to all processes inside the container.
+ 	all bool
+ 	// pid targets one specific process; it is left at 0 when the flag is not given.
+ 	pid int
+ }
+
+ // Name implements subcommands.Command.Name and returns the command name.
+ func (k *Kill) Name() string {
+ 	return "kill"
+ }
+
+ // Synopsis implements subcommands.Command.Synopsis with a one-line summary.
+ func (k *Kill) Synopsis() string {
+ 	return "sends a signal to the container"
+ }
+
+ // Usage implements subcommands.Command.Usage and shows the expected arguments.
+ func (k *Kill) Usage() string {
+ 	return `kill <container id> [signal]`
+ }
+
+ // SetFlags implements subcommands.Command.SetFlags by registering the --all
+ // and --pid flags on f.
+ func (k *Kill) SetFlags(f *flag.FlagSet) {
+ 	f.BoolVar(&k.all, "all", false, "send the specified signal to all processes inside the container")
+ 	f.IntVar(&k.pid, "pid", 0, "send the specified signal to a specific process")
+ }
+
+ // Execute implements subcommands.Command.Execute. It loads the container named
+ // by the first argument and delivers the requested signal either to one
+ // process (--pid) or to the container (optionally to all of its processes).
+ func (k *Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ 	if n := f.NArg(); n < 1 || n > 2 {
+ 		f.Usage()
+ 		return subcommands.ExitUsageError
+ 	}
+
+ 	containerID := f.Arg(0)
+ 	conf := args[0].(*boot.Config)
+
+ 	if k.all && k.pid != 0 {
+ 		Fatalf("it is invalid to specify both --all and --pid")
+ 	}
+
+ 	cont, err := container.Load(conf.RootDir, containerID)
+ 	if err != nil {
+ 		Fatalf("loading container: %v", err)
+ 	}
+
+ 	// The OCI command-line spec says that the signal should be specified
+ 	// via a flag, but runc (and things that call runc) pass it as an
+ 	// argument. When the argument is absent, TERM is used.
+ 	sigName := "TERM"
+ 	if arg := f.Arg(1); arg != "" {
+ 		sigName = arg
+ 	}
+
+ 	sig, err := parseSignal(sigName)
+ 	if err != nil {
+ 		Fatalf("%v", err)
+ 	}
+
+ 	if k.pid == 0 {
+ 		if err := cont.SignalContainer(sig, k.all); err != nil {
+ 			Fatalf("%v", err)
+ 		}
+ 		return subcommands.ExitSuccess
+ 	}
+ 	if err := cont.SignalProcess(sig, int32(k.pid)); err != nil {
+ 		Fatalf("failed to signal pid %d: %v", k.pid, err)
+ 	}
+ 	return subcommands.ExitSuccess
+ }
+
+ // parseSignal converts s into a signal. s may be a decimal signal number or a
+ // signal name in any letter case, with or without the "SIG" prefix. Only
+ // signals present in signalMap are accepted; anything else yields -1 and an
+ // error.
+ func parseSignal(s string) (syscall.Signal, error) {
+ 	if n, err := strconv.Atoi(s); err == nil {
+ 		// Numeric form: accept it only if some known name maps to this number.
+ 		want := syscall.Signal(n)
+ 		for _, known := range signalMap {
+ 			if known == want {
+ 				return want, nil
+ 			}
+ 		}
+ 		return -1, fmt.Errorf("unknown signal %q", s)
+ 	}
+
+ 	name := strings.TrimPrefix(strings.ToUpper(s), "SIG")
+ 	sig, ok := signalMap[name]
+ 	if !ok {
+ 		return -1, fmt.Errorf("unknown signal %q", s)
+ 	}
+ 	return sig, nil
+ }
+
+ // signalMap maps signal names, written without the "SIG" prefix, to their
+ // signal values. parseSignal looks names up by key and validates numeric
+ // input against the values.
+ var signalMap = map[string]syscall.Signal{
+ "ABRT": unix.SIGABRT,
+ "ALRM": unix.SIGALRM,
+ "BUS": unix.SIGBUS,
+ "CHLD": unix.SIGCHLD,
+ "CLD": unix.SIGCLD,
+ "CONT": unix.SIGCONT,
+ "FPE": unix.SIGFPE,
+ "HUP": unix.SIGHUP,
+ "ILL": unix.SIGILL,
+ "INT": unix.SIGINT,
+ "IO": unix.SIGIO,
+ "IOT": unix.SIGIOT,
+ "KILL": unix.SIGKILL,
+ "PIPE": unix.SIGPIPE,
+ "POLL": unix.SIGPOLL,
+ "PROF": unix.SIGPROF,
+ "PWR": unix.SIGPWR,
+ "QUIT": unix.SIGQUIT,
+ "SEGV": unix.SIGSEGV,
+ "STKFLT": unix.SIGSTKFLT,
+ "STOP": unix.SIGSTOP,
+ "SYS": unix.SIGSYS,
+ "TERM": unix.SIGTERM,
+ "TRAP": unix.SIGTRAP,
+ "TSTP": unix.SIGTSTP,
+ "TTIN": unix.SIGTTIN,
+ "TTOU": unix.SIGTTOU,
+ "URG": unix.SIGURG,
+ "USR1": unix.SIGUSR1,
+ "USR2": unix.SIGUSR2,
+ "VTALRM": unix.SIGVTALRM,
+ "WINCH": unix.SIGWINCH,
+ "XCPU": unix.SIGXCPU,
+ "XFSZ": unix.SIGXFSZ,
+ }
diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go
new file mode 100644
index 000000000..1f5ca2473
--- /dev/null
+++ b/runsc/cmd/list.go
@@ -0,0 +1,117 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "os"
+ "text/tabwriter"
+ "time"
+
+ "flag"
+ "github.com/google/subcommands"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/container"
+)
+
+ // List is the subcommands.Command behind "runsc list".
+ type List struct {
+ 	// quiet restricts the output to container IDs.
+ 	quiet bool
+ 	// format selects the output format: "text" or "json".
+ 	format string
+ }
+
+ // Name implements subcommands.Command.Name and returns the command name.
+ func (l *List) Name() string {
+ 	return "list"
+ }
+
+ // Synopsis implements subcommands.Command.Synopsis with a one-line summary.
+ func (l *List) Synopsis() string {
+ 	return "list containers started by runsc with the given root"
+ }
+
+ // Usage implements subcommands.Command.Usage and shows the expected arguments.
+ func (l *List) Usage() string {
+ 	return `list [flags]`
+ }
+
+ // SetFlags implements subcommands.Command.SetFlags by registering the -quiet
+ // and -format flags on f.
+ func (l *List) SetFlags(f *flag.FlagSet) {
+ 	f.BoolVar(&l.quiet, "quiet", false, "only list container ids")
+ 	f.StringVar(&l.format, "format", "text", "output format: 'text' (default) or 'json'")
+ }
+
+ // Execute implements subcommands.Command.Execute. It prints the containers
+ // found under the configured root directory: IDs only in quiet mode, otherwise
+ // a table ("text") or the JSON-encoded list of states ("json").
+ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ 	if f.NArg() != 0 {
+ 		f.Usage()
+ 		return subcommands.ExitUsageError
+ 	}
+
+ 	conf := args[0].(*boot.Config)
+ 	ids, err := container.List(conf.RootDir)
+ 	if err != nil {
+ 		Fatalf("%v", err)
+ 	}
+
+ 	// Quiet mode needs nothing beyond the IDs, so no container is loaded.
+ 	if l.quiet {
+ 		for i := range ids {
+ 			fmt.Println(ids[i])
+ 		}
+ 		return subcommands.ExitSuccess
+ 	}
+
+ 	// Load every container up front so both formats work from the same data.
+ 	var loaded []*container.Container
+ 	for _, id := range ids {
+ 		cont, loadErr := container.Load(conf.RootDir, id)
+ 		if loadErr != nil {
+ 			Fatalf("loading container %q: %v", id, loadErr)
+ 		}
+ 		loaded = append(loaded, cont)
+ 	}
+
+ 	switch l.format {
+ 	case "json":
+ 		// Only the states are emitted.
+ 		var states []specs.State
+ 		for _, cont := range loaded {
+ 			states = append(states, cont.State())
+ 		}
+ 		if err := json.NewEncoder(os.Stdout).Encode(states); err != nil {
+ 			Fatalf("marshaling container state: %v", err)
+ 		}
+ 	case "text":
+ 		// Tab-aligned table, one row per container.
+ 		tw := tabwriter.NewWriter(os.Stdout, 12, 1, 3, ' ', 0)
+ 		fmt.Fprint(tw, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\tOWNER\n")
+ 		for _, cont := range loaded {
+ 			fmt.Fprintf(tw, "%s\t%d\t%s\t%s\t%s\t%s\n",
+ 				cont.ID,
+ 				cont.SandboxPid(),
+ 				cont.Status,
+ 				cont.BundleDir,
+ 				cont.CreatedAt.Format(time.RFC3339Nano),
+ 				cont.Owner)
+ 		}
+ 		tw.Flush()
+ 	default:
+ 		Fatalf("unknown list format %q", l.format)
+ 	}
+ 	return subcommands.ExitSuccess
+ }
diff --git a/runsc/cmd/path.go b/runsc/cmd/path.go
new file mode 100644
index 000000000..0e9ef7fa5
--- /dev/null
+++ b/runsc/cmd/path.go
@@ -0,0 +1,28 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "os"
+)
+
+ // getwdOrDie reports the process's current working directory. A failure to
+ // obtain it is handed to Fatalf.
+ func getwdOrDie() string {
+ 	dir, err := os.Getwd()
+ 	if err == nil {
+ 		return dir
+ 	}
+ 	Fatalf("getting current working directory: %v", err)
+ 	return dir
+ }
diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go
new file mode 100644
index 000000000..11b36aa10
--- /dev/null
+++ b/runsc/cmd/pause.go
@@ -0,0 +1,68 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/container"
+)
+
+ // Pause is the subcommands.Command behind "runsc pause". It has no flags.
+ type Pause struct{}
+
+ // Name implements subcommands.Command.Name and returns the command name.
+ func (p *Pause) Name() string {
+ 	return "pause"
+ }
+
+ // Synopsis implements subcommands.Command.Synopsis with a one-line summary.
+ func (p *Pause) Synopsis() string {
+ 	return "pause suspends all processes in a container"
+ }
+
+ // Usage implements subcommands.Command.Usage and shows the expected arguments.
+ func (p *Pause) Usage() string {
+ 	return `pause <container id> - pause process in instance of container.`
+ }
+
+ // SetFlags implements subcommands.Command.SetFlags. No flags are registered.
+ func (p *Pause) SetFlags(*flag.FlagSet) {}
+
+ // Execute implements subcommands.Command.Execute. It loads the container named
+ // by the single positional argument and pauses it.
+ func (*Pause) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ 	if f.NArg() != 1 {
+ 		f.Usage()
+ 		return subcommands.ExitUsageError
+ 	}
+
+ 	containerID := f.Arg(0)
+ 	conf := args[0].(*boot.Config)
+
+ 	c, loadErr := container.Load(conf.RootDir, containerID)
+ 	if loadErr != nil {
+ 		Fatalf("loading container: %v", loadErr)
+ 	}
+
+ 	pauseErr := c.Pause()
+ 	if pauseErr != nil {
+ 		Fatalf("pause failed: %v", pauseErr)
+ 	}
+
+ 	return subcommands.ExitSuccess
+ }
diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go
new file mode 100644
index 000000000..3a3e6f17a
--- /dev/null
+++ b/runsc/cmd/ps.go
@@ -0,0 +1,86 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "fmt"
+
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/control"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/container"
+)
+
+ // PS implements subcommands.Command for the "ps" command.
+ type PS struct {
+ 	// format selects the output format: "table" or "json".
+ 	format string
+ }
+
+ // Name implements subcommands.Command.Name.
+ func (*PS) Name() string {
+ 	return "ps"
+ }
+
+ // Synopsis implements subcommands.Command.Synopsis.
+ func (*PS) Synopsis() string {
+ 	return "ps displays the processes running inside a container"
+ }
+
+ // Usage implements subcommands.Command.Usage.
+ //
+ // The string starts with the command name, like the usage strings of the
+ // other commands, and lists only what Execute accepts: flags followed by
+ // exactly one container ID.
+ func (*PS) Usage() string {
+ 	return "ps [flags] <container id>"
+ }
+
+ // SetFlags implements subcommands.Command.SetFlags.
+ func (ps *PS) SetFlags(f *flag.FlagSet) {
+ 	f.StringVar(&ps.format, "format", "table", "output format. Select one of: table or json (default: table)")
+ }
+
+ // Execute implements subcommands.Command.Execute. It loads the container named
+ // by the single positional argument, fetches its process list and prints it
+ // in the format selected by -format.
+ func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ 	if f.NArg() != 1 {
+ 		f.Usage()
+ 		return subcommands.ExitUsageError
+ 	}
+
+ 	containerID := f.Arg(0)
+ 	conf := args[0].(*boot.Config)
+
+ 	cont, err := container.Load(conf.RootDir, containerID)
+ 	if err != nil {
+ 		Fatalf("loading sandbox: %v", err)
+ 	}
+ 	procs, err := cont.Processes()
+ 	if err != nil {
+ 		Fatalf("getting processes for container: %v", err)
+ 	}
+
+ 	switch ps.format {
+ 	case "json":
+ 		out, jsonErr := control.PrintPIDsJSON(procs)
+ 		if jsonErr != nil {
+ 			Fatalf("generating JSON: %v", jsonErr)
+ 		}
+ 		fmt.Println(out)
+ 	case "table":
+ 		fmt.Println(control.ProcessListToTable(procs))
+ 	default:
+ 		Fatalf("unsupported format: %s", ps.format)
+ 	}
+
+ 	return subcommands.ExitSuccess
+ }
diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go
new file mode 100644
index 000000000..3ab2f5676
--- /dev/null
+++ b/runsc/cmd/restore.go
@@ -0,0 +1,106 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "path/filepath"
+ "syscall"
+
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/container"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+ // Restore is the subcommands.Command behind "runsc restore".
+ type Restore struct {
+ 	// Create is embedded because restore accepts every flag create does.
+ 	Create
+
+ 	// imagePath is the directory holding the saved container image.
+ 	imagePath string
+
+ 	// detach makes runsc start the process and exit without waiting for it.
+ 	detach bool
+ }
+
+ // Name implements subcommands.Command.Name and returns the command name.
+ func (r *Restore) Name() string {
+ 	return "restore"
+ }
+
+ // Synopsis implements subcommands.Command.Synopsis with a one-line summary.
+ func (r *Restore) Synopsis() string {
+ 	return "restore a saved state of container (experimental)"
+ }
+
+ // Usage implements subcommands.Command.Usage and shows the expected arguments.
+ func (r *Restore) Usage() string {
+ 	return `restore [flags] <container id> - restore saved state of container.
+ `
+ }
+
+ // SetFlags implements subcommands.Command.SetFlags. It registers the create
+ // flags, the restore-specific ones, and two flags that are accepted and
+ // ignored.
+ func (r *Restore) SetFlags(f *flag.FlagSet) {
+ 	r.Create.SetFlags(f)
+ 	f.StringVar(&r.imagePath, "image-path", "", "directory path to saved container image")
+ 	f.BoolVar(&r.detach, "detach", false, "detach from the container's process")
+
+ 	// Unimplemented flags necessary for compatibility with docker. Their
+ 	// values are never read.
+ 	_ = f.Bool("no-subreaper", false, "ignored")
+ 	_ = f.String("work-path", "", "ignored")
+ }
+
+ // Execute implements subcommands.Command.Execute. It reads the bundle's spec,
+ // points the configuration at the checkpoint file under -image-path, runs the
+ // container and stores the resulting wait status in the *syscall.WaitStatus
+ // passed as the second extra argument.
+ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ 	if f.NArg() != 1 {
+ 		f.Usage()
+ 		return subcommands.ExitUsageError
+ 	}
+
+ 	containerID := f.Arg(0)
+ 	conf := args[0].(*boot.Config)
+ 	statusOut := args[1].(*syscall.WaitStatus)
+
+ 	// Without an explicit bundle directory, use the current directory.
+ 	bundle := r.bundleDir
+ 	if bundle == "" {
+ 		bundle = getwdOrDie()
+ 	}
+ 	spec, err := specutils.ReadSpec(bundle)
+ 	if err != nil {
+ 		Fatalf("reading spec: %v", err)
+ 	}
+ 	specutils.LogSpec(spec)
+
+ 	if len(r.imagePath) == 0 {
+ 		Fatalf("image-path flag must be provided")
+ 	}
+
+ 	conf.RestoreFile = filepath.Join(r.imagePath, checkpointFileName)
+
+ 	status, err := container.Run(containerID, spec, conf, bundle, r.consoleSocket, r.pidFile, r.userLog, r.detach)
+ 	if err != nil {
+ 		Fatalf("running container: %v", err)
+ 	}
+ 	*statusOut = status
+
+ 	return subcommands.ExitSuccess
+ }
diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go
new file mode 100644
index 000000000..9a2ade41e
--- /dev/null
+++ b/runsc/cmd/resume.go
@@ -0,0 +1,69 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/container"
+)
+
+ // Resume is the subcommands.Command behind "runsc resume". It has no flags.
+ type Resume struct{}
+
+ // Name implements subcommands.Command.Name and returns the command name.
+ func (r *Resume) Name() string {
+ 	return "resume"
+ }
+
+ // Synopsis implements subcommands.Command.Synopsis with a one-line summary.
+ func (r *Resume) Synopsis() string {
+ 	return "Resume unpauses a paused container"
+ }
+
+ // Usage implements subcommands.Command.Usage and shows the expected arguments.
+ func (r *Resume) Usage() string {
+ 	return `resume <container id> - resume a paused container.
+ `
+ }
+
+ // SetFlags implements subcommands.Command.SetFlags. No flags are registered.
+ func (r *Resume) SetFlags(*flag.FlagSet) {}
+
+ // Execute implements subcommands.Command.Execute. It loads the container named
+ // by the single positional argument and resumes it.
+ func (r *Resume) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ 	if f.NArg() != 1 {
+ 		f.Usage()
+ 		return subcommands.ExitUsageError
+ 	}
+
+ 	containerID := f.Arg(0)
+ 	conf := args[0].(*boot.Config)
+
+ 	c, loadErr := container.Load(conf.RootDir, containerID)
+ 	if loadErr != nil {
+ 		Fatalf("loading container: %v", loadErr)
+ 	}
+
+ 	resumeErr := c.Resume()
+ 	if resumeErr != nil {
+ 		Fatalf("resume failed: %v", resumeErr)
+ 	}
+
+ 	return subcommands.ExitSuccess
+ }
diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go
new file mode 100644
index 000000000..c228b4f93
--- /dev/null
+++ b/runsc/cmd/run.go
@@ -0,0 +1,87 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "syscall"
+
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/container"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+ // Run is the subcommands.Command behind "runsc run".
+ type Run struct {
+ 	// Create is embedded because run accepts every flag create does.
+ 	Create
+
+ 	// detach makes runsc start the process and exit without waiting for it.
+ 	detach bool
+ }
+
+ // Name implements subcommands.Command.Name and returns the command name.
+ func (r *Run) Name() string {
+ 	return "run"
+ }
+
+ // Synopsis implements subcommands.Command.Synopsis with a one-line summary.
+ func (r *Run) Synopsis() string {
+ 	return "create and run a secure container"
+ }
+
+ // Usage implements subcommands.Command.Usage and shows the expected arguments.
+ func (r *Run) Usage() string {
+ 	return `run [flags] <container id> - create and run a secure container.
+ `
+ }
+
+ // SetFlags implements subcommands.Command.SetFlags. It registers -detach and
+ // then the flags inherited from Create.
+ func (r *Run) SetFlags(f *flag.FlagSet) {
+ 	f.BoolVar(&r.detach, "detach", false, "detach from the container's process")
+ 	r.Create.SetFlags(f)
+ }
+
+ // Execute implements subcommands.Command.Execute. It reads the bundle's spec,
+ // runs the container and stores the resulting wait status in the
+ // *syscall.WaitStatus passed as the second extra argument.
+ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ 	if f.NArg() != 1 {
+ 		f.Usage()
+ 		return subcommands.ExitUsageError
+ 	}
+
+ 	containerID := f.Arg(0)
+ 	conf := args[0].(*boot.Config)
+ 	statusOut := args[1].(*syscall.WaitStatus)
+
+ 	// Without an explicit bundle directory, use the current directory.
+ 	bundle := r.bundleDir
+ 	if bundle == "" {
+ 		bundle = getwdOrDie()
+ 	}
+ 	spec, err := specutils.ReadSpec(bundle)
+ 	if err != nil {
+ 		Fatalf("reading spec: %v", err)
+ 	}
+ 	specutils.LogSpec(spec)
+
+ 	status, err := container.Run(containerID, spec, conf, bundle, r.consoleSocket, r.pidFile, r.userLog, r.detach)
+ 	if err != nil {
+ 		Fatalf("running container: %v", err)
+ 	}
+
+ 	*statusOut = status
+ 	return subcommands.ExitSuccess
+ }
diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go
new file mode 100644
index 000000000..344da13ba
--- /dev/null
+++ b/runsc/cmd/spec.go
@@ -0,0 +1,182 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+
+ "flag"
+ "github.com/google/subcommands"
+)
+
+ // specTemplate is the JSON document that Spec.Execute writes to config.json.
+ // It describes a container that runs "sh" as uid/gid 0 with a terminal, a
+ // read-only "rootfs" root, and /proc, /dev and /sys mounts.
+ var specTemplate = []byte(`{
+ "ociVersion": "1.0.0",
+ "process": {
+ "terminal": true,
+ "user": {
+ "uid": 0,
+ "gid": 0
+ },
+ "args": [
+ "sh"
+ ],
+ "env": [
+ "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+ "TERM=xterm"
+ ],
+ "cwd": "/",
+ "capabilities": {
+ "bounding": [
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE"
+ ],
+ "effective": [
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE"
+ ],
+ "inheritable": [
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE"
+ ],
+ "permitted": [
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE"
+ ],
+ "ambient": [
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE"
+ ]
+ },
+ "rlimits": [
+ {
+ "type": "RLIMIT_NOFILE",
+ "hard": 1024,
+ "soft": 1024
+ }
+ ]
+ },
+ "root": {
+ "path": "rootfs",
+ "readonly": true
+ },
+ "hostname": "runsc",
+ "mounts": [
+ {
+ "destination": "/proc",
+ "type": "proc",
+ "source": "proc"
+ },
+ {
+ "destination": "/dev",
+ "type": "tmpfs",
+ "source": "tmpfs",
+ "options": []
+ },
+ {
+ "destination": "/sys",
+ "type": "sysfs",
+ "source": "sysfs",
+ "options": [
+ "nosuid",
+ "noexec",
+ "nodev",
+ "ro"
+ ]
+ }
+ ],
+ "linux": {
+ "namespaces": [
+ {
+ "type": "pid"
+ },
+ {
+ "type": "network"
+ },
+ {
+ "type": "ipc"
+ },
+ {
+ "type": "uts"
+ },
+ {
+ "type": "mount"
+ }
+ ]
+ }
+ }`)
+
+ // Spec is the subcommands.Command behind "runsc spec".
+ type Spec struct {
+ 	// bundle is the path to the root of the OCI bundle.
+ 	bundle string
+ }
+
+ // Name implements subcommands.Command.Name and returns the command name.
+ func (s *Spec) Name() string {
+ 	return "spec"
+ }
+
+ // Synopsis implements subcommands.Command.Synopsis with a one-line summary.
+ func (s *Spec) Synopsis() string {
+ 	return "create a new OCI bundle specification file"
+ }
+
+ // Usage implements subcommands.Command.Usage with a description of the
+ // command and a worked example.
+ func (s *Spec) Usage() string {
+ 	return `spec [options] - create a new OCI bundle specification file.
+
+ The spec command creates a new specification file (config.json) for a new OCI bundle.
+
+ The specification file is a starter file that runs the "sh" command in the container. You
+ should edit the file to suit your needs. You can find out more about the format of the
+ specification file by visiting the OCI runtime spec repository:
+ https://github.com/opencontainers/runtime-spec/
+
+ EXAMPLE:
+ $ mkdir -p bundle/rootfs
+ $ cd bundle
+ $ runsc spec
+ $ docker export $(docker create hello-world) | tar -xf - -C rootfs
+ $ sed -i 's;"sh";"/hello";' config.json
+ $ sudo runsc run hello
+
+ `
+ }
+
+ // SetFlags implements subcommands.Command.SetFlags by registering the -bundle
+ // flag, which defaults to the current directory.
+ func (s *Spec) SetFlags(f *flag.FlagSet) {
+ 	f.StringVar(&s.bundle, "bundle", ".", "path to the root of the OCI bundle")
+ }
+
+ // Execute implements subcommands.Command.Execute. It writes specTemplate to
+ // config.json inside the bundle directory and refuses to overwrite an
+ // existing file.
+ func (s *Spec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ 	confPath := filepath.Join(s.bundle, "config.json")
+
+ 	// Only a successful Stat proves that the file exists. Any other failure
+ 	// (for example a permission error on the bundle directory) is reported
+ 	// as what it is instead of being mislabeled as "already exists".
+ 	_, err := os.Stat(confPath)
+ 	switch {
+ 	case err == nil:
+ 		Fatalf("file %q already exists", confPath)
+ 	case !os.IsNotExist(err):
+ 		Fatalf("checking for existing %q: %v", confPath, err)
+ 	}
+
+ 	if err := ioutil.WriteFile(confPath, specTemplate, 0664); err != nil {
+ 		Fatalf("writing to %q: %v", confPath, err)
+ 	}
+
+ 	return subcommands.ExitSuccess
+ }
diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go
new file mode 100644
index 000000000..657726251
--- /dev/null
+++ b/runsc/cmd/start.go
@@ -0,0 +1,65 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/container"
+)
+
+ // Start is the subcommands.Command behind "runsc start". It has no flags.
+ type Start struct{}
+
+ // Name implements subcommands.Command.Name and returns the command name.
+ func (s *Start) Name() string {
+ 	return "start"
+ }
+
+ // Synopsis implements subcommands.Command.Synopsis with a one-line summary.
+ func (s *Start) Synopsis() string {
+ 	return "start a secure container"
+ }
+
+ // Usage implements subcommands.Command.Usage and shows the expected arguments.
+ func (s *Start) Usage() string {
+ 	return `start <container id> - start a secure container.`
+ }
+
+ // SetFlags implements subcommands.Command.SetFlags. No flags are registered.
+ func (s *Start) SetFlags(*flag.FlagSet) {}
+
+ // Execute implements subcommands.Command.Execute. It loads the container named
+ // by the single positional argument and starts it with the given
+ // configuration.
+ func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ 	if f.NArg() != 1 {
+ 		f.Usage()
+ 		return subcommands.ExitUsageError
+ 	}
+
+ 	containerID := f.Arg(0)
+ 	conf := args[0].(*boot.Config)
+
+ 	cont, loadErr := container.Load(conf.RootDir, containerID)
+ 	if loadErr != nil {
+ 		Fatalf("loading container: %v", loadErr)
+ 	}
+ 	startErr := cont.Start(conf)
+ 	if startErr != nil {
+ 		Fatalf("starting container: %v", startErr)
+ 	}
+ 	return subcommands.ExitSuccess
+ }
diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go
new file mode 100644
index 000000000..f0d449b19
--- /dev/null
+++ b/runsc/cmd/state.go
@@ -0,0 +1,76 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "encoding/json"
+ "os"
+
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/container"
+)
+
+ // State is the subcommands.Command behind "runsc state". It has no flags.
+ type State struct{}
+
+ // Name implements subcommands.Command.Name and returns the command name.
+ func (s *State) Name() string {
+ 	return "state"
+ }
+
+ // Synopsis implements subcommands.Command.Synopsis with a one-line summary.
+ func (s *State) Synopsis() string {
+ 	return "get the state of a container"
+ }
+
+ // Usage implements subcommands.Command.Usage and shows the expected arguments.
+ func (s *State) Usage() string {
+ 	return `state [flags] <container id> - get the state of a container`
+ }
+
+ // SetFlags implements subcommands.Command.SetFlags. No flags are registered.
+ func (s *State) SetFlags(*flag.FlagSet) {}
+
+ // Execute implements subcommands.Command.Execute. It loads the container named
+ // by the single positional argument and writes its state to stdout as
+ // indented JSON.
+ func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ 	if f.NArg() != 1 {
+ 		f.Usage()
+ 		return subcommands.ExitUsageError
+ 	}
+
+ 	id := f.Arg(0)
+ 	conf := args[0].(*boot.Config)
+
+ 	c, err := container.Load(conf.RootDir, id)
+ 	if err != nil {
+ 		Fatalf("loading container: %v", err)
+ 	}
+ 	log.Debugf("Returning state for container %+v", c)
+
+ 	state := c.State()
+ 	log.Debugf("State: %+v", state)
+
+ 	// Write json-encoded state directly to stdout.
+ 	b, err := json.MarshalIndent(state, "", " ")
+ 	if err != nil {
+ 		Fatalf("marshaling container state: %v", err)
+ 	}
+ 	// The state is this command's only output, so a failed write must not
+ 	// be reported as success.
+ 	if _, err := os.Stdout.Write(b); err != nil {
+ 		Fatalf("writing container state: %v", err)
+ 	}
+ 	return subcommands.ExitSuccess
+ }
diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go
new file mode 100644
index 000000000..a55a682f3
--- /dev/null
+++ b/runsc/cmd/wait.go
@@ -0,0 +1,127 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "encoding/json"
+ "os"
+ "syscall"
+
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/container"
+)
+
+ // unsetPID is the default of the PID flags and marks them as not given.
+ const unsetPID = -1
+
+ // Wait is the subcommands.Command behind "runsc wait".
+ type Wait struct {
+ 	// rootPID is a PID in the sandbox root PID namespace, or unsetPID.
+ 	rootPID int
+ 	// pid is a PID in the container's PID namespace, or unsetPID.
+ 	pid int
+ }
+
+ // Name implements subcommands.Command.Name and returns the command name.
+ func (wt *Wait) Name() string {
+ 	return "wait"
+ }
+
+ // Synopsis implements subcommands.Command.Synopsis with a one-line summary.
+ func (wt *Wait) Synopsis() string {
+ 	return "wait on a process inside a container"
+ }
+
+ // Usage implements subcommands.Command.Usage and shows the expected arguments.
+ func (wt *Wait) Usage() string {
+ 	return `wait [flags] <container id>`
+ }
+
+ // SetFlags implements subcommands.Command.SetFlags by registering the
+ // -rootpid and -pid flags, both defaulting to unsetPID.
+ func (wt *Wait) SetFlags(f *flag.FlagSet) {
+ 	f.IntVar(&wt.rootPID, "rootpid", unsetPID, "select a PID in the sandbox root PID namespace to wait on instead of the container's root process")
+ 	f.IntVar(&wt.pid, "pid", unsetPID, "select a PID in the container's PID namespace to wait on instead of the container's root process")
+ }
+
+ // Execute implements subcommands.Command.Execute. It waits for a process in a
+ // container to exit before returning, then writes the container ID and the
+ // process's exit status to stdout as JSON.
+ //
+ // With no PID flag it waits on the whole container; -rootpid and -pid select
+ // a single process instead and are mutually exclusive.
+ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ 	if f.NArg() != 1 {
+ 		f.Usage()
+ 		return subcommands.ExitUsageError
+ 	}
+ 	// You can't specify both -pid and -rootpid.
+ 	if wt.rootPID != unsetPID && wt.pid != unsetPID {
+ 		Fatalf("only one of -pid and -rootpid can be set")
+ 	}
+
+ 	id := f.Arg(0)
+ 	conf := args[0].(*boot.Config)
+
+ 	c, err := container.Load(conf.RootDir, id)
+ 	if err != nil {
+ 		Fatalf("loading container: %v", err)
+ 	}
+
+ 	var waitStatus syscall.WaitStatus
+ 	switch {
+ 	// Wait on the whole container.
+ 	case wt.rootPID == unsetPID && wt.pid == unsetPID:
+ 		ws, err := c.Wait()
+ 		if err != nil {
+ 			Fatalf("waiting on container %q: %v", c.ID, err)
+ 		}
+ 		waitStatus = ws
+ 	// Wait on a PID in the root PID namespace.
+ 	case wt.rootPID != unsetPID:
+ 		ws, err := c.WaitRootPID(int32(wt.rootPID), true /* clearStatus */)
+ 		if err != nil {
+ 			Fatalf("waiting on PID in root PID namespace %d in container %q: %v", wt.rootPID, c.ID, err)
+ 		}
+ 		waitStatus = ws
+ 	// Wait on a PID in the container's PID namespace.
+ 	case wt.pid != unsetPID:
+ 		ws, err := c.WaitPID(int32(wt.pid), true /* clearStatus */)
+ 		if err != nil {
+ 			Fatalf("waiting on PID %d in container %q: %v", wt.pid, c.ID, err)
+ 		}
+ 		waitStatus = ws
+ 	}
+ 	result := waitResult{
+ 		ID:         id,
+ 		ExitStatus: exitStatus(waitStatus),
+ 	}
+ 	// Write json-encoded wait result directly to stdout.
+ 	if err := json.NewEncoder(os.Stdout).Encode(result); err != nil {
+ 		Fatalf("marshaling wait result: %v", err)
+ 	}
+ 	return subcommands.ExitSuccess
+ }
+
+ // waitResult is the JSON document printed by the wait command.
+ type waitResult struct {
+ 	// ID is the container ID given on the command line.
+ 	ID string `json:"id"`
+ 	// ExitStatus is the value computed by exitStatus for the awaited process.
+ 	ExitStatus int `json:"exitStatus"`
+ }
+
+ // exitStatus converts a wait status into a process exit status: 128 plus the
+ // signal number when the process was killed by a signal, and the status's own
+ // exit code otherwise.
+ func exitStatus(status syscall.WaitStatus) int {
+ 	if !status.Signaled() {
+ 		return status.ExitStatus()
+ 	}
+ 	return 128 + int(status.Signal())
+ }
diff --git a/runsc/console/console.go b/runsc/console/console.go
new file mode 100644
index 000000000..64b23639a
--- /dev/null
+++ b/runsc/console/console.go
@@ -0,0 +1,63 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package console contains utilities for working with pty consoles in runsc.
+package console
+
+import (
+ "fmt"
+ "net"
+ "os"
+
+ "github.com/kr/pty"
+ "golang.org/x/sys/unix"
+)
+
+// NewWithSocket opens a new pty master/slave pair, passes the master FD to the
+// unix socket at socketPath, and hands the slave back to the caller. The
+// local copy of the master is always closed before returning; the slave is
+// closed only when an error is returned.
+func NewWithSocket(socketPath string) (*os.File, error) {
+	ptyMaster, ptySlave, err := pty.Open()
+	if err != nil {
+		return nil, fmt.Errorf("opening pty: %v", err)
+	}
+	defer ptyMaster.Close()
+
+	// fail releases the slave, which the caller will never see, and builds
+	// the error to return.
+	fail := func(format string, a ...interface{}) (*os.File, error) {
+		ptySlave.Close()
+		return nil, fmt.Errorf(format, a...)
+	}
+
+	// Connect to the socket and get at its underlying file.
+	conn, err := net.Dial("unix", socketPath)
+	if err != nil {
+		return fail("dialing socket %q: %v", socketPath, err)
+	}
+	defer conn.Close()
+
+	unixConn, isUnix := conn.(*net.UnixConn)
+	if !isUnix {
+		return fail("connection is not a UnixConn: %T", conn)
+	}
+	sockFile, err := unixConn.File()
+	if err != nil {
+		return fail("getting file for unix socket %v: %v", unixConn, err)
+	}
+	defer sockFile.Close()
+
+	// The master FD travels as ancillary data next to a small payload.
+	rights := unix.UnixRights(int(ptyMaster.Fd()))
+	if err := unix.Sendmsg(int(sockFile.Fd()), []byte("pty-master"), rights, nil, 0); err != nil {
+		return fail("sending console over unix socket %q: %v", socketPath, err)
+	}
+	return ptySlave, nil
+}
diff --git a/runsc/container/container.go b/runsc/container/container.go
new file mode 100644
index 000000000..513085836
--- /dev/null
+++ b/runsc/container/container.go
@@ -0,0 +1,1053 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package container creates and manipulates containers.
+package container
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "os/exec"
+ "os/signal"
+ "path/filepath"
+ "regexp"
+ "strconv"
+ "strings"
+ "syscall"
+ "time"
+
+ "github.com/cenkalti/backoff"
+ "github.com/gofrs/flock"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/control"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/cgroup"
+ "gvisor.googlesource.com/gvisor/runsc/sandbox"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+// Names of the files kept in each container's root directory.
+const (
+ // metadataFilename is the name of the metadata file relative to the
+ // container root directory that holds sandbox metadata.
+ metadataFilename = "meta.json"
+
+ // metadataLockFilename is the name of a lock file in the container
+ // root directory that is used to prevent concurrent modifications to
+ // the container state and metadata.
+ metadataLockFilename = "meta.lock"
+)
+
+// containerIDRegex is the pattern every container id must match. See
+// libcontainer/factory_linux.go. It is compiled once at package
+// initialization rather than on every validateID call.
+//
+// NOTE(review): inside the character class, `+-\.` is parsed as the range
+// '+' through '.', so ',' is accepted too. The pattern is left untouched to
+// stay identical to the libcontainer one; confirm whether that is intended.
+var containerIDRegex = regexp.MustCompile(`^[\w+-\.]+$`)
+
+// validateID validates the container id. It returns nil when id is non-empty
+// and consists only of characters allowed by containerIDRegex, and an error
+// naming the offending id otherwise.
+func validateID(id string) error {
+	if !containerIDRegex.MatchString(id) {
+		return fmt.Errorf("invalid container id: %v", id)
+	}
+	return nil
+}
+
+// Container represents a containerized application. When running, the
+// container is associated with a single Sandbox.
+//
+// Container metadata can be saved and loaded to disk. Within a root directory,
+// we maintain subdirectories for each container named with the container id.
+// The container metadata is stored as a json within the container directory
+// in a file named "meta.json". This metadata format is defined by us and is
+// not part of the OCI spec.
+//
+// Containers must write their metadata files after any change to their internal
+// states. The entire container directory is deleted when the container is
+// destroyed.
+//
+// When the container is stopped, all processes that belong to the container
+// must be stopped before Destroy() returns. containerd makes roughly the
+// following calls to stop a container:
+// - First it attempts to kill the container process with
+// 'runsc kill SIGTERM'. After some time, it escalates to SIGKILL. In a
+// separate thread, it's waiting on the container. As soon as the wait
+// returns, it moves on to the next step:
+// - It calls 'runsc kill --all SIGKILL' to stop every process that belongs to
+// the container. 'kill --all SIGKILL' waits for all processes before
+// returning.
+// - Containerd waits for stdin, stdout and stderr to drain and be closed.
+// - It calls 'runsc delete'. runc implementation kills --all SIGKILL once
+// again just to be sure, waits, and then proceeds with remaining teardown.
+//
+type Container struct {
+ // ID is the container ID.
+ ID string `json:"id"`
+
+ // Spec is the OCI runtime spec that configures this container.
+ Spec *specs.Spec `json:"spec"`
+
+ // BundleDir is the directory containing the container bundle.
+ BundleDir string `json:"bundleDir"`
+
+ // Root is the directory containing the container metadata file. If this
+ // container is the root container, Root and RootContainerDir will be the
+ // same.
+ Root string `json:"root"`
+
+ // CreatedAt is the time the container was created.
+ CreatedAt time.Time `json:"createdAt"`
+
+ // Owner is the container owner.
+ Owner string `json:"owner"`
+
+ // ConsoleSocket is the path to a unix domain socket that will receive
+ // the console FD.
+ ConsoleSocket string `json:"consoleSocket"`
+
+ // Status is the current container Status.
+ Status Status `json:"status"`
+
+ // GoferPid is the PID of the gofer running along side the sandbox. May
+ // be 0 if the gofer has been killed.
+ GoferPid int `json:"goferPid"`
+
+ // goferIsChild is set if a gofer process is a child of the current process.
+ //
+ // This field isn't saved to json, because only a creator of a gofer
+ // process will have it as a child process.
+ goferIsChild bool
+
+ // Sandbox is the sandbox this container is running in. It's set when the
+ // container is created and reset when the sandbox is destroyed.
+ Sandbox *sandbox.Sandbox `json:"sandbox"`
+
+ // RootContainerDir is the root directory containing the metadata file of the
+ // sandbox root container. It's used to lock in order to serialize creating
+ // and deleting this Container's metadata directory. If this container is the
+ // root container, this is the same as Root.
+ //
+ // The field has no json tag, so encoding/json stores it under its Go
+ // field name, "RootContainerDir".
+ RootContainerDir string
+}
+
+// Load reads the metadata file of the container named id under rootDir and
+// returns the Container it describes. id may be an abbreviation of the full
+// container id, in which case Load loads the container that id unambiguously
+// refers to.
+//
+// If the container does not exist, the underlying "not exist" error is
+// returned unwrapped so that callers can detect it.
+func Load(rootDir, id string) (*Container, error) {
+	log.Debugf("Load container %q %q", rootDir, id)
+	if err := validateID(id); err != nil {
+		return nil, fmt.Errorf("validating id: %v", err)
+	}
+
+	// The error is passed through untouched so that callers can tell
+	// 'not found' apart from other failures.
+	cRoot, err := findContainerRoot(rootDir, id)
+	if err != nil {
+		return nil, err
+	}
+
+	// Hold the metadata lock while reading, so that no other runsc instance
+	// writes the file underneath us.
+	unlock, err := lockContainerMetadata(cRoot)
+	if err != nil {
+		return nil, err
+	}
+	defer unlock()
+
+	// Build the Container from the metadata file.
+	metaFile := filepath.Join(cRoot, metadataFilename)
+	metaBytes, err := ioutil.ReadFile(metaFile)
+	switch {
+	case err == nil:
+		// Metadata read successfully.
+	case os.IsNotExist(err):
+		// Passed through untouched, for the same reason as above.
+		return nil, err
+	default:
+		return nil, fmt.Errorf("reading container metadata file %q: %v", metaFile, err)
+	}
+	c := new(Container)
+	if err := json.Unmarshal(metaBytes, c); err != nil {
+		return nil, fmt.Errorf("unmarshaling container metadata from %q: %v", metaFile, err)
+	}
+
+	// A container recorded as "Running" or "Created" is only believed if the
+	// sandbox process still exists; otherwise it is marked Stopped.
+	//
+	// This is inherently racy.
+	if c.Status != Running && c.Status != Created {
+		return c, nil
+	}
+	if !c.isSandboxRunning() {
+		// Without a sandbox the container cannot exist either.
+		c.changeStatus(Stopped)
+		return c, nil
+	}
+	if c.Status == Running {
+		// The state should reflect the application itself, so the gofer
+		// process is not considered here.
+		if err := c.SignalContainer(syscall.Signal(0), false); err != nil {
+			c.changeStatus(Stopped)
+		}
+	}
+	return c, nil
+}
+
+// findContainerRoot resolves partialID to the metadata directory of a
+// container under rootDir. A directory entry named exactly partialID wins;
+// otherwise partialID must be the prefix of exactly one listed id. It returns
+// os.ErrNotExist when nothing matches and an error when several ids match.
+func findContainerRoot(rootDir, partialID string) (string, error) {
+	// First try the id as a complete container id.
+	exact := filepath.Join(rootDir, partialID)
+	if _, statErr := os.Stat(exact); statErr == nil {
+		return exact, nil
+	}
+
+	// Otherwise treat it as an abbreviation, which must match exactly one of
+	// the existing container ids.
+	ids, err := List(rootDir)
+	if err != nil {
+		return "", err
+	}
+	var fullID string
+	for _, candidate := range ids {
+		if !strings.HasPrefix(candidate, partialID) {
+			continue
+		}
+		if fullID != "" {
+			return "", fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, fullID, candidate)
+		}
+		fullID = candidate
+	}
+	if fullID == "" {
+		return "", os.ErrNotExist
+	}
+	log.Debugf("abbreviated id %q resolves to full id %q", partialID, fullID)
+	return filepath.Join(rootDir, fullID), nil
+}
+
+// List returns the name of every entry found directly in rootDir; each entry
+// is taken to be a container id.
+func List(rootDir string) ([]string, error) {
+	log.Debugf("List containers %q", rootDir)
+	entries, err := ioutil.ReadDir(rootDir)
+	if err != nil {
+		return nil, fmt.Errorf("reading dir %q: %v", rootDir, err)
+	}
+	var ids []string
+	for i := range entries {
+		ids = append(ids, entries[i].Name())
+	}
+	return ids, nil
+}
+
+// Create creates the container in a new Sandbox process, unless the metadata
+// indicates that an existing Sandbox should be used. The caller must call
+// Destroy() on the container.
+//
+// consoleSocket and userLog are handed to the new sandbox when one is
+// created. If pidFile is not empty, the sandbox PID is written to it as the
+// very last step. On any error the partially created container is destroyed
+// before Create returns.
+func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile, userLog string) (*Container, error) {
+ log.Debugf("Create container %q in root dir: %s", id, conf.RootDir)
+ if err := validateID(id); err != nil {
+ return nil, err
+ }
+
+ unlockRoot, err := maybeLockRootContainer(spec, conf.RootDir)
+ if err != nil {
+ return nil, err
+ }
+ defer unlockRoot()
+
+ // Lock the container metadata file to prevent concurrent creations of
+ // containers with the same id.
+ containerRoot := filepath.Join(conf.RootDir, id)
+ unlock, err := lockContainerMetadata(containerRoot)
+ if err != nil {
+ return nil, err
+ }
+ defer unlock()
+
+ // Check if the container already exists by looking for the metadata
+ // file.
+ if _, err := os.Stat(filepath.Join(containerRoot, metadataFilename)); err == nil {
+ return nil, fmt.Errorf("container with id %q already exists", id)
+ } else if !os.IsNotExist(err) {
+ return nil, fmt.Errorf("looking for existing container in %q: %v", containerRoot, err)
+ }
+
+ c := &Container{
+ ID: id,
+ Spec: spec,
+ ConsoleSocket: consoleSocket,
+ BundleDir: bundleDir,
+ Root: containerRoot,
+ Status: Creating,
+ CreatedAt: time.Now(),
+ Owner: os.Getenv("USER"),
+ RootContainerDir: conf.RootDir,
+ }
+ // The Cleanup object cleans up partially created containers when an error occurs.
+ // Any errors occurring during cleanup itself are ignored.
+ cu := specutils.MakeCleanup(func() { _ = c.Destroy() })
+ defer cu.Clean()
+
+ // If the metadata annotations indicate that this container should be
+ // started in an existing sandbox, we must do so. The metadata will
+ // indicate the ID of the sandbox, which is the same as the ID of the
+ // init container in the sandbox.
+ if isRoot(spec) {
+ log.Debugf("Creating new sandbox for container %q", id)
+
+ // Create and join cgroup before processes are created to ensure they are
+ // part of the cgroup from the start (and all their children processes).
+ cg, err := cgroup.New(spec)
+ if err != nil {
+ return nil, err
+ }
+ if cg != nil {
+ // If there is cgroup config, install it before creating sandbox process.
+ if err := cg.Install(spec.Linux.Resources); err != nil {
+ return nil, fmt.Errorf("configuring cgroup: %v", err)
+ }
+ }
+ if err := runInCgroup(cg, func() error {
+ ioFiles, specFile, err := c.createGoferProcess(spec, conf, bundleDir)
+ if err != nil {
+ return err
+ }
+
+ // Start a new sandbox for this container. Any errors after this point
+ // must destroy the container.
+ c.Sandbox, err = sandbox.New(id, spec, conf, bundleDir, consoleSocket, userLog, ioFiles, specFile, cg)
+ return err
+ }); err != nil {
+ return nil, err
+ }
+ } else {
+ // This is sort of confusing. For a sandbox with a root
+ // container and a child container in it, runsc sees:
+ // * A container struct whose sandbox ID is equal to the
+ // container ID. This is the root container that is tied to
+ // the creation of the sandbox.
+ // * A container struct whose sandbox ID is equal to the above
+ // container/sandbox ID, but that has a different container
+ // ID. This is the child container.
+ sbid, ok := specutils.SandboxID(spec)
+ if !ok {
+ return nil, fmt.Errorf("no sandbox ID found when creating container")
+ }
+ log.Debugf("Creating new container %q in sandbox %q", c.ID, sbid)
+
+ // Find the sandbox associated with this ID.
+ sb, err := Load(conf.RootDir, sbid)
+ if err != nil {
+ return nil, err
+ }
+ c.Sandbox = sb.Sandbox
+ if err := c.Sandbox.CreateContainer(c.ID); err != nil {
+ return nil, err
+ }
+ }
+ c.changeStatus(Created)
+
+ // Save the metadata file.
+ if err := c.save(); err != nil {
+ return nil, err
+ }
+
+ // Write the PID file. Containerd considers the create complete after
+ // this file is created, so it must be the last thing we do.
+ if pidFile != "" {
+ if err := ioutil.WriteFile(pidFile, []byte(strconv.Itoa(c.SandboxPid())), 0644); err != nil {
+ return nil, fmt.Errorf("error writing PID file: %v", err)
+ }
+ }
+
+ // Creation succeeded: disarm the cleanup so the container is kept.
+ cu.Release()
+ return c, nil
+}
+
+// Start launches the containerized process inside the sandbox. The container
+// must be in the Created state; on success it is moved to Running and its
+// metadata is saved.
+func (c *Container) Start(conf *boot.Config) error {
+	log.Debugf("Start container %q", c.ID)
+
+	unlockRoot, err := maybeLockRootContainer(c.Spec, c.RootContainerDir)
+	if err != nil {
+		return err
+	}
+	defer unlockRoot()
+
+	unlock, err := c.lock()
+	if err != nil {
+		return err
+	}
+	defer unlock()
+
+	if err := c.requireStatus("start", Created); err != nil {
+		return err
+	}
+
+	// "If any prestart hook fails, the runtime MUST generate an error,
+	// stop and destroy the container" -OCI spec.
+	hooks := c.Spec.Hooks
+	if hooks != nil {
+		if err := executeHooks(hooks.Prestart, c.State()); err != nil {
+			return err
+		}
+	}
+
+	// startInSandbox brings up a non-root container: it creates the gofer,
+	// adopts the mount list the gofer reports back, and asks the sandbox to
+	// start the container.
+	startInSandbox := func() error {
+		ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir)
+		if err != nil {
+			return err
+		}
+		defer mountsFile.Close()
+
+		cleanMounts, err := specutils.ReadMounts(mountsFile)
+		if err != nil {
+			return fmt.Errorf("reading mounts file: %v", err)
+		}
+		c.Spec.Mounts = cleanMounts
+
+		return c.Sandbox.StartContainer(c.Spec, conf, c.ID, ioFiles)
+	}
+
+	if !isRoot(c.Spec) {
+		// Join cgroup to start gofer process to ensure it's part of the cgroup
+		// from the start (and all their children processes).
+		if err := runInCgroup(c.Sandbox.Cgroup, startInSandbox); err != nil {
+			return err
+		}
+	} else {
+		if err := c.Sandbox.StartRoot(c.Spec, conf); err != nil {
+			return err
+		}
+	}
+
+	// "If any poststart hook fails, the runtime MUST log a warning, but
+	// the remaining hooks and lifecycle continue as if the hook had
+	// succeeded" -OCI spec.
+	if hooks != nil {
+		executeHooksBestEffort(hooks.Poststart, c.State())
+	}
+
+	c.changeStatus(Running)
+	return c.save()
+}
+
+// Restore replaces the kernel and file system of a container in the Created
+// state with the ones recorded in restoreFile. On success the container is
+// moved to Running and its metadata is saved.
+func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile string) error {
+	log.Debugf("Restore container %q", c.ID)
+
+	unlock, err := c.lock()
+	if err != nil {
+		return err
+	}
+	defer unlock()
+
+	if err := c.requireStatus("restore", Created); err != nil {
+		return err
+	}
+
+	// "If any prestart hook fails, the runtime MUST generate an error,
+	// stop and destroy the container" -OCI spec.
+	if hooks := c.Spec.Hooks; hooks != nil {
+		if err := executeHooks(hooks.Prestart, c.State()); err != nil {
+			return err
+		}
+	}
+
+	if err := c.Sandbox.Restore(c.ID, spec, conf, restoreFile); err != nil {
+		return err
+	}
+
+	c.changeStatus(Running)
+	return c.save()
+}
+
+// Run is a helper that calls Create + Start + Wait. When conf.RestoreFile is
+// set, the container is restored from that file instead of being started.
+//
+// If detach is true, Run returns a zero status right after the container is
+// running and leaves it alive. Otherwise it returns the result of Wait; the
+// cleanup is not released on that path, so the deferred Clean presumably
+// destroys the container once Wait returns.
+func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile, userLog string, detach bool) (syscall.WaitStatus, error) {
+	log.Debugf("Run container %q in root dir: %s", id, conf.RootDir)
+	c, err := Create(id, spec, conf, bundleDir, consoleSocket, pidFile, userLog)
+	if err != nil {
+		return 0, fmt.Errorf("creating container: %v", err)
+	}
+	// Clean up partially created container if an error occurs.
+	// Any errors returned by Destroy() itself are deliberately ignored.
+	cu := specutils.MakeCleanup(func() { _ = c.Destroy() })
+	defer cu.Clean()
+
+	if conf.RestoreFile != "" {
+		log.Debugf("Restore: %v", conf.RestoreFile)
+		if err := c.Restore(spec, conf, conf.RestoreFile); err != nil {
+			// Name the step that failed; this used to say "starting".
+			return 0, fmt.Errorf("restoring container: %v", err)
+		}
+	} else {
+		if err := c.Start(conf); err != nil {
+			return 0, fmt.Errorf("starting container: %v", err)
+		}
+	}
+	if detach {
+		// The container must outlive this process, so disarm the cleanup.
+		cu.Release()
+		return 0, nil
+	}
+	return c.Wait()
+}
+
+// Execute runs the command described by args in the container and returns
+// the PID of the new process. The container must be Created or Running.
+// args.ContainerID is overwritten with this container's id.
+func (c *Container) Execute(args *control.ExecArgs) (int32, error) {
+	log.Debugf("Execute in container %q, args: %+v", c.ID, args)
+	err := c.requireStatus("execute in", Created, Running)
+	if err != nil {
+		return 0, err
+	}
+	args.ContainerID = c.ID
+	return c.Sandbox.Execute(args)
+}
+
+// Event asks the sandbox for the events of this container. The container
+// must be Created, Running or Paused.
+func (c *Container) Event() (*boot.Event, error) {
+	log.Debugf("Getting events for container %q", c.ID)
+	err := c.requireStatus("get events for", Created, Running, Paused)
+	if err != nil {
+		return nil, err
+	}
+	return c.Sandbox.Event(c.ID)
+}
+
+// SandboxPid returns the Pid of the sandbox the container runs in. It returns
+// -1 unless the container is Created, Running or Paused.
+func (c *Container) SandboxPid() int {
+	if c.requireStatus("get PID", Created, Running, Paused) == nil {
+		return c.Sandbox.Pid
+	}
+	return -1
+}
+
+// Wait blocks until the container exits and returns its WaitStatus.
+// Waiting on an already stopped container is how its exit status is
+// retrieved; in that case the call returns immediately.
+func (c *Container) Wait() (syscall.WaitStatus, error) {
+	log.Debugf("Wait on container %q", c.ID)
+	sb := c.Sandbox
+	return sb.Wait(c.ID)
+}
+
+// WaitRootPID waits for the process whose PID, expressed in the sandbox's
+// PID namespace, is 'pid', and returns its WaitStatus. It fails if the
+// sandbox is not running.
+func (c *Container) WaitRootPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) {
+	log.Debugf("Wait on PID %d in sandbox %q", pid, c.Sandbox.ID)
+	if c.isSandboxRunning() {
+		return c.Sandbox.WaitPID(c.Sandbox.ID, pid, clearStatus)
+	}
+	return 0, fmt.Errorf("sandbox is not running")
+}
+
+// WaitPID waits for the process whose PID, expressed in the container's PID
+// namespace, is 'pid', and returns its WaitStatus. It fails if the sandbox
+// is not running.
+func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) {
+	log.Debugf("Wait on PID %d in container %q", pid, c.ID)
+	if c.isSandboxRunning() {
+		return c.Sandbox.WaitPID(c.ID, pid, clearStatus)
+	}
+	return 0, fmt.Errorf("sandbox is not running")
+}
+
+// SignalContainer delivers sig to the container. If all is true and the
+// signal is SIGKILL, it waits for all processes to exit before returning.
+// An error is returned if the container is already stopped.
+// TODO(b/113680494): Distinguish different error types.
+func (c *Container) SignalContainer(sig syscall.Signal, all bool) error {
+	log.Debugf("Signal container %q: %v", c.ID, sig)
+	// The Stopped state is accepted on purpose. With all=false an error
+	// comes back anyway; with all=true the signal can still reach the other
+	// processes of the container after its init process has exited, which
+	// is especially useful for container cleanup.
+	err := c.requireStatus("signal", Running, Stopped)
+	if err != nil {
+		return err
+	}
+	if c.isSandboxRunning() {
+		return c.Sandbox.SignalContainer(c.ID, sig, all)
+	}
+	return fmt.Errorf("sandbox is not running")
+}
+
+// SignalProcess delivers sig to the process 'pid' of the container. The
+// container must be Running and its sandbox alive.
+func (c *Container) SignalProcess(sig syscall.Signal, pid int32) error {
+	log.Debugf("Signal process %d in container %q: %v", pid, c.ID, sig)
+	err := c.requireStatus("signal a process inside", Running)
+	if err != nil {
+		return err
+	}
+	if c.isSandboxRunning() {
+		return c.Sandbox.SignalProcess(c.ID, pid, sig, false)
+	}
+	return fmt.Errorf("sandbox is not running")
+}
+
+// ForwardSignals relays every signal received by the current process to
+// process 'pid' of the container inside the sandbox. The returned function
+// stops the forwarding.
+func (c *Container) ForwardSignals(pid int32, fgProcess bool) func() {
+	log.Debugf("Forwarding all signals to container %q PID %d fgProcess=%t", c.ID, pid, fgProcess)
+	sigCh := make(chan os.Signal, 1)
+	signal.Notify(sigCh)
+
+	// forward relays signals until sigCh is closed by the stop function.
+	forward := func() {
+		for {
+			s, open := <-sigCh
+			if !open {
+				break
+			}
+			log.Debugf("Forwarding signal %d to container %q PID %d fgProcess=%t", s, c.ID, pid, fgProcess)
+			if err := c.Sandbox.SignalProcess(c.ID, pid, s.(syscall.Signal), fgProcess); err != nil {
+				log.Warningf("error forwarding signal %d to container %q: %v", s, c.ID, err)
+			}
+		}
+		log.Debugf("Done forwarding signals to container %q PID %d fgProcess=%t", c.ID, pid, fgProcess)
+	}
+	go forward()
+
+	stop := func() {
+		// Stop delivery first, so that closing the channel is safe.
+		signal.Stop(sigCh)
+		close(sigCh)
+	}
+	return stop
+}
+
+// Checkpoint forwards the checkpoint call to the container's sandbox.
+// The statefile will be written to f, the file at the specified image-path.
+// The container must be Created, Running or Paused.
+func (c *Container) Checkpoint(f *os.File) error {
+	log.Debugf("Checkpoint container %q", c.ID)
+	err := c.requireStatus("checkpoint", Created, Running, Paused)
+	if err != nil {
+		return err
+	}
+	return c.Sandbox.Checkpoint(c.ID, f)
+}
+
+// Pause suspends the container and its kernel. It only succeeds when the
+// container is Created or Running; on success the container is moved to
+// Paused and its metadata is saved.
+func (c *Container) Pause() error {
+	log.Debugf("Pausing container %q", c.ID)
+	unlock, err := c.lock()
+	if err != nil {
+		return err
+	}
+	defer unlock()
+
+	switch c.Status {
+	case Created, Running:
+		// These are the only states that can be paused.
+	default:
+		return fmt.Errorf("cannot pause container %q in state %v", c.ID, c.Status)
+	}
+
+	if err := c.Sandbox.Pause(c.ID); err != nil {
+		return fmt.Errorf("pausing container: %v", err)
+	}
+
+	c.changeStatus(Paused)
+	return c.save()
+}
+
+// Resume unpauses the container and its kernel. It only succeeds when the
+// container is Paused; on success the container is moved to Running and its
+// metadata is saved.
+func (c *Container) Resume() error {
+	log.Debugf("Resuming container %q", c.ID)
+	unlock, err := c.lock()
+	if err != nil {
+		return err
+	}
+	defer unlock()
+
+	if paused := c.Status == Paused; !paused {
+		return fmt.Errorf("cannot resume container %q in state %v", c.ID, c.Status)
+	}
+
+	if err := c.Sandbox.Resume(c.ID); err != nil {
+		return fmt.Errorf("resuming container: %v", err)
+	}
+
+	c.changeStatus(Running)
+	return c.save()
+}
+
+// State builds the OCI state of the container from its id, status, sandbox
+// PID and bundle directory.
+func (c *Container) State() specs.State {
+	st := specs.State{
+		Version: specs.Version,
+		ID:      c.ID,
+		Bundle:  c.BundleDir,
+	}
+	st.Status = c.Status.String()
+	st.Pid = c.SandboxPid()
+	return st
+}
+
+// Processes returns the processes of the container together with their
+// metadata. The container must be Running or Paused.
+func (c *Container) Processes() ([]*control.Process, error) {
+	err := c.requireStatus("get processes of", Running, Paused)
+	if err != nil {
+		return nil, err
+	}
+	return c.Sandbox.Processes(c.ID)
+}
+
+// Destroy stops all processes and frees all resources associated with the
+// container.
+//
+// Failing to take the root container lock aborts immediately. After that,
+// every cleanup step is attempted even if an earlier one failed, and the
+// returned error (if any) lists all failures, one per line.
+func (c *Container) Destroy() error {
+	log.Debugf("Destroy container %q", c.ID)
+
+	// We must perform the following cleanup steps:
+	// * stop the container and gofer processes,
+	// * remove the container filesystem on the host, and
+	// * delete the container metadata directory.
+	//
+	// It's possible for one or more of these steps to fail, but we should
+	// do our best to perform all of the cleanups. Hence, we keep a slice
+	// of errors and return their concatenation.
+	var errs []string
+
+	unlock, err := maybeLockRootContainer(c.Spec, c.RootContainerDir)
+	if err != nil {
+		return err
+	}
+	defer unlock()
+
+	if err := c.stop(); err != nil {
+		err = fmt.Errorf("stopping container: %v", err)
+		log.Warningf("%v", err)
+		errs = append(errs, err.Error())
+	}
+
+	if err := os.RemoveAll(c.Root); err != nil && !os.IsNotExist(err) {
+		err = fmt.Errorf("deleting container root directory %q: %v", c.Root, err)
+		log.Warningf("%v", err)
+		errs = append(errs, err.Error())
+	}
+
+	c.changeStatus(Stopped)
+
+	// "If any poststop hook fails, the runtime MUST log a warning, but the
+	// remaining hooks and lifecycle continue as if the hook had succeeded" -OCI spec.
+	// Based on the OCI, "The post-stop hooks MUST be called after the container is
+	// deleted but before the delete operation returns"
+	// Run it here to:
+	// 1) Conform to the OCI.
+	// 2) Make sure it only runs once, because the root has been deleted, the container
+	// can't be loaded again.
+	if c.Spec.Hooks != nil {
+		executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State())
+	}
+
+	if len(errs) == 0 {
+		return nil
+	}
+	// The joined messages are data, not a format string: they may contain
+	// '%' (from a path, for instance), which Errorf would try to interpret.
+	return fmt.Errorf("%s", strings.Join(errs, "\n"))
+}
+
+// save writes the json encoding of the container to the metadata file in
+// its root directory.
+//
+// Precondition: container must be locked with container.lock().
+func (c *Container) save() error {
+	log.Debugf("Save container %q", c.ID)
+	meta, err := json.Marshal(c)
+	if err != nil {
+		return fmt.Errorf("invalid container metadata: %v", err)
+	}
+	metaFile := filepath.Join(c.Root, metadataFilename)
+	if writeErr := ioutil.WriteFile(metaFile, meta, 0640); writeErr != nil {
+		return fmt.Errorf("writing container metadata: %v", writeErr)
+	}
+	return nil
+}
+
+// stop stops the container (for regular containers) or the sandbox (for
+// root containers), then waits for the container or sandbox and for the
+// gofer to be gone. An error is returned if any of them is still around when
+// the wait times out.
+func (c *Container) stop() error {
+	// cg is only set when the root container is being stopped, because the
+	// cgroup is uninstalled together with the sandbox.
+	var cg *cgroup.Cgroup
+
+	if sb := c.Sandbox; sb != nil {
+		log.Debugf("Destroying container %q", c.ID)
+		if err := sb.DestroyContainer(c.ID); err != nil {
+			return fmt.Errorf("destroying container %q: %v", c.ID, err)
+		}
+		if sb.IsRootContainer(c.ID) {
+			cg = sb.Cgroup
+		}
+		// The sandbox is forgotten only after it has been told to destroy
+		// the container.
+		c.Sandbox = nil
+	}
+
+	// The gofer may not exit together with the container, so kill it.
+	if pid := c.GoferPid; pid != 0 {
+		log.Debugf("Killing gofer for container %q, PID: %d", c.ID, pid)
+		if err := syscall.Kill(pid, syscall.SIGKILL); err != nil {
+			// The gofer may already be stopped, log the error.
+			log.Warningf("Error sending signal %d to gofer %d: %v", syscall.SIGKILL, pid, err)
+		}
+	}
+
+	if err := c.waitForStopped(); err != nil {
+		return err
+	}
+
+	// The gofer runs inside the cgroup, so the cgroup can only be
+	// uninstalled once the gofer is gone.
+	if cg == nil {
+		return nil
+	}
+	if err := cg.Uninstall(); err != nil {
+		return err
+	}
+	return nil
+}
+
+// waitForStopped polls every 100ms, for at most 5 seconds, until neither the
+// container nor its gofer is running. GoferPid is reset to 0 once the gofer
+// is known to be gone.
+func (c *Container) waitForStopped() error {
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	// poll returns nil once everything has stopped and an error otherwise,
+	// which makes backoff.Retry try again.
+	poll := func() error {
+		if c.isSandboxRunning() && c.SignalContainer(syscall.Signal(0), false) == nil {
+			return fmt.Errorf("container is still running")
+		}
+		if c.GoferPid == 0 {
+			return nil
+		}
+		if !c.goferIsChild {
+			// Not our child: probe it with signal 0.
+			if syscall.Kill(c.GoferPid, 0) == nil {
+				return fmt.Errorf("gofer is still running")
+			}
+		} else {
+			// The gofer process is a child of the current process,
+			// so we can wait it and collect its zombie.
+			wpid, err := syscall.Wait4(c.GoferPid, nil, syscall.WNOHANG, nil)
+			if err != nil {
+				return fmt.Errorf("error waiting the gofer process: %v", err)
+			}
+			if wpid == 0 {
+				return fmt.Errorf("gofer is still running")
+			}
+		}
+		c.GoferPid = 0
+		return nil
+	}
+
+	every100ms := backoff.NewConstantBackOff(100 * time.Millisecond)
+	return backoff.Retry(poll, backoff.WithContext(every100ms, ctx))
+}
+
+// createGoferProcess starts the gofer for this container as a child process
+// running in its own namespaces, and records its PID in c.GoferPid.
+//
+// It returns the sandbox ends of the IO socket pairs (one for the root mount
+// plus one per 9P mount in spec) and the read end of the pipe over which the
+// gofer sends the mount list. The caller owns the returned files. On error
+// nothing is returned and every descriptor opened here is closed.
+func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]*os.File, *os.File, error) {
+	// Start with the general config flags.
+	args := conf.ToFlags()
+
+	// goferEnds holds the files donated to the gofer through
+	// cmd.ExtraFiles; entry i shows up as descriptor 3+i in the child, which
+	// is what nextFD keeps track of.
+	var goferEnds []*os.File
+
+	// nextFD is the next available file descriptor for the gofer process.
+	// It starts at 3 because 0-2 are used by stdin/stdout/stderr.
+	nextFD := 3
+
+	if conf.LogFilename != "" {
+		logFile, err := os.OpenFile(conf.LogFilename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
+		if err != nil {
+			return nil, nil, fmt.Errorf("opening log file %q: %v", conf.LogFilename, err)
+		}
+		defer logFile.Close()
+		goferEnds = append(goferEnds, logFile)
+		args = append(args, "--log-fd="+strconv.Itoa(nextFD))
+		nextFD++
+	}
+
+	if conf.DebugLog != "" {
+		debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "gofer")
+		if err != nil {
+			return nil, nil, fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err)
+		}
+		defer debugLogFile.Close()
+		goferEnds = append(goferEnds, debugLogFile)
+		args = append(args, "--debug-log-fd="+strconv.Itoa(nextFD))
+		nextFD++
+	}
+
+	args = append(args, "gofer", "--bundle", bundleDir)
+	if conf.Overlay {
+		args = append(args, "--panic-on-write=true")
+	}
+
+	// Open the spec file to donate to the gofer.
+	specFile, err := specutils.OpenSpec(bundleDir)
+	if err != nil {
+		return nil, nil, fmt.Errorf("opening spec file: %v", err)
+	}
+	defer specFile.Close()
+	goferEnds = append(goferEnds, specFile)
+	args = append(args, "--spec-fd="+strconv.Itoa(nextFD))
+	nextFD++
+
+	// Create pipe that allows gofer to send mount list to sandbox after all paths
+	// have been resolved.
+	mountsSand, mountsGofer, err := os.Pipe()
+	if err != nil {
+		return nil, nil, err
+	}
+	defer mountsGofer.Close()
+	goferEnds = append(goferEnds, mountsGofer)
+	args = append(args, fmt.Sprintf("--mounts-fd=%d", nextFD))
+	nextFD++
+
+	// The sandbox-side descriptors are handed to the caller on success only.
+	// Until then, make sure any failure closes them instead of leaking them.
+	var sandEnds []*os.File
+	cu := specutils.MakeCleanup(func() {
+		mountsSand.Close()
+		for _, f := range sandEnds {
+			f.Close()
+		}
+	})
+	defer cu.Clean()
+
+	// Add root mount and then add any other additional mounts.
+	mountCount := 1
+	for _, m := range spec.Mounts {
+		if specutils.Is9PMount(m) {
+			mountCount++
+		}
+	}
+
+	sandEnds = make([]*os.File, 0, mountCount)
+	for i := 0; i < mountCount; i++ {
+		fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
+		if err != nil {
+			return nil, nil, err
+		}
+		sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox IO FD"))
+
+		// The gofer end must stay open until the child has been started, so
+		// it is closed when the function returns, not at the end of the
+		// iteration.
+		goferEnd := os.NewFile(uintptr(fds[1]), "gofer IO FD")
+		defer goferEnd.Close()
+		goferEnds = append(goferEnds, goferEnd)
+
+		args = append(args, fmt.Sprintf("--io-fds=%d", nextFD))
+		nextFD++
+	}
+
+	binPath := specutils.ExePath
+	cmd := exec.Command(binPath, args...)
+	cmd.ExtraFiles = goferEnds
+	cmd.Args[0] = "runsc-gofer"
+
+	// Enter new namespaces to isolate from the rest of the system. Don't unshare
+	// cgroup because gofer is added to a cgroup in the caller's namespace.
+	nss := []specs.LinuxNamespace{
+		{Type: specs.IPCNamespace},
+		{Type: specs.MountNamespace},
+		{Type: specs.NetworkNamespace},
+		{Type: specs.PIDNamespace},
+		{Type: specs.UTSNamespace},
+	}
+
+	// Setup any uid/gid mappings, and create or join the configured user
+	// namespace so the gofer's view of the filesystem aligns with the
+	// users in the sandbox.
+	userNS := specutils.FilterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec)
+	nss = append(nss, userNS...)
+	specutils.SetUIDGIDMappings(cmd, spec)
+	if len(userNS) != 0 {
+		// We need to set UID and GID to have capabilities in a new user namespace.
+		cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0}
+	}
+
+	// Start the gofer in the given namespace.
+	log.Debugf("Starting gofer: %s %v", binPath, args)
+	if err := specutils.StartInNS(cmd, nss); err != nil {
+		return nil, nil, fmt.Errorf("Gofer: %v", err)
+	}
+	log.Infof("Gofer started, PID: %d", cmd.Process.Pid)
+	c.GoferPid = cmd.Process.Pid
+	c.goferIsChild = true
+
+	// Success: ownership of the sandbox-side descriptors moves to the caller.
+	cu.Release()
+	return sandEnds, mountsSand, nil
+}
+
+// changeStatus transitions from one status to another ensuring that the
+// transition is valid.
+func (c *Container) changeStatus(s Status) {
+ switch s {
+ case Creating:
+ // Initial state, never transitions to it.
+ panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
+
+ case Created:
+ if c.Status != Creating {
+ panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
+ }
+ if c.Sandbox == nil {
+ panic("sandbox cannot be nil")
+ }
+
+ case Paused:
+ if c.Status != Running {
+ panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
+ }
+ if c.Sandbox == nil {
+ panic("sandbox cannot be nil")
+ }
+
+ case Running:
+ if c.Status != Created && c.Status != Paused {
+ panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
+ }
+ if c.Sandbox == nil {
+ panic("sandbox cannot be nil")
+ }
+
+ case Stopped:
+ if c.Status != Creating && c.Status != Created && c.Status != Running && c.Status != Stopped {
+ panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
+ }
+
+ default:
+ panic(fmt.Sprintf("invalid new state: %v", s))
+ }
+ c.Status = s
+}
+
+ // isSandboxRunning reports whether the container has a sandbox and that
+ // sandbox's IsRunning method returns true.
+ func (c *Container) isSandboxRunning() bool {
+ 	if c.Sandbox == nil {
+ 		return false
+ 	}
+ 	return c.Sandbox.IsRunning()
+ }
+
+ // requireStatus returns nil when the container's current status is one of
+ // statuses. Otherwise it returns an error naming the action that was
+ // attempted, the container ID and the current status.
+ func (c *Container) requireStatus(action string, statuses ...Status) error {
+ 	for i := range statuses {
+ 		if statuses[i] == c.Status {
+ 			return nil
+ 		}
+ 	}
+ 	return fmt.Errorf("cannot %s container %q in state %s", action, c.ID, c.Status)
+ }
+
+ // lock takes a file lock on the metadata lock file kept in this container's
+ // directory (c.Root joined with c.ID). The returned function releases it.
+ func (c *Container) lock() (func() error, error) {
+ 	containerDir := filepath.Join(c.Root, c.ID)
+ 	return lockContainerMetadata(containerDir)
+ }
+
+ // lockContainerMetadata takes a file lock on the metadata lock file in the
+ // given container root directory, creating the directory (mode 0711) first if
+ // needed. On success it returns the function that releases the lock.
+ func lockContainerMetadata(containerRootDir string) (func() error, error) {
+ 	err := os.MkdirAll(containerRootDir, 0711)
+ 	if err != nil {
+ 		return nil, fmt.Errorf("creating container root directory %q: %v", containerRootDir, err)
+ 	}
+
+ 	lockPath := filepath.Join(containerRootDir, metadataLockFilename)
+ 	fileLock := flock.NewFlock(lockPath)
+ 	if err := fileLock.Lock(); err != nil {
+ 		return nil, fmt.Errorf("acquiring lock on container lock file %q: %v", lockPath, err)
+ 	}
+ 	return fileLock.Unlock, nil
+ }
+
+ // maybeLockRootContainer locks the sandbox root container. It is used to
+ // prevent races to create and delete child container sandboxes.
+ //
+ // When spec itself describes the root container nothing is locked and a no-op
+ // unlock function is returned. Otherwise the root container is loaded from
+ // rootDir using the sandbox ID found in spec, and its metadata lock is taken.
+ func maybeLockRootContainer(spec *specs.Spec, rootDir string) (func() error, error) {
+ 	if isRoot(spec) {
+ 		noop := func() error { return nil }
+ 		return noop, nil
+ 	}
+
+ 	sandboxID, found := specutils.SandboxID(spec)
+ 	if !found {
+ 		return nil, fmt.Errorf("no sandbox ID found when locking root container")
+ 	}
+
+ 	root, err := Load(rootDir, sandboxID)
+ 	if err != nil {
+ 		return nil, err
+ 	}
+ 	return root.lock()
+ }
+
+ // isRoot reports whether spec is for a root container, which is decided by
+ // specutils.ShouldCreateSandbox.
+ func isRoot(spec *specs.Spec) bool {
+ return specutils.ShouldCreateSandbox(spec)
+ }
+
+ // runInCgroup executes fn inside the specified cgroup. If cg is nil, execute
+ // it in the current context. Otherwise the cgroup is joined via cg.Join, fn
+ // is run only if the join succeeded, and the restore function returned by
+ // Join is invoked on return in either case.
+ func runInCgroup(cg *cgroup.Cgroup, fn func() error) error {
+ if cg == nil {
+ return fn()
+ }
+ restore, err := cg.Join()
+ // NOTE(review): restore is deferred before err is checked, so it also runs
+ // when Join fails. This presumably relies on Join always returning a
+ // non-nil restore function, even on error -- confirm against cgroup.Join.
+ defer restore()
+ if err != nil {
+ return err
+ }
+ return fn()
+ }
diff --git a/runsc/container/hook.go b/runsc/container/hook.go
new file mode 100644
index 000000000..acae6781e
--- /dev/null
+++ b/runsc/container/hook.go
@@ -0,0 +1,111 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package container
+
+import (
+ "bytes"
+ "encoding/json"
+ "fmt"
+ "os/exec"
+ "path/filepath"
+ "strings"
+ "time"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+)
+
+// This file implements hooks as defined in OCI spec:
+// https://github.com/opencontainers/runtime-spec/blob/master/config.md#toc22
+//
+// "hooks":{
+// "prestart":[{
+// "path":"/usr/bin/dockerd",
+// "args":[
+// "libnetwork-setkey", "arg2",
+// ]
+// }]
+// },
+
+ // executeHooksBestEffort runs every hook in order. A hook that fails is
+ // reported with a warning and does not stop the remaining hooks from running.
+ func executeHooksBestEffort(hooks []specs.Hook, s specs.State) {
+ 	for i := range hooks {
+ 		hook := hooks[i]
+ 		err := executeHook(hook, s)
+ 		if err == nil {
+ 			continue
+ 		}
+ 		log.Warningf("Failure to execute hook %+v, err: %v", hook, err)
+ 	}
+ }
+
+ // executeHooks runs the hooks in order and stops at the first failure,
+ // returning that hook's error. It returns nil when every hook succeeded.
+ func executeHooks(hooks []specs.Hook, s specs.State) error {
+ 	for i := range hooks {
+ 		err := executeHook(hooks[i], s)
+ 		if err != nil {
+ 			return err
+ 		}
+ 	}
+ 	return nil
+ }
+
+ // executeHook runs a single hook, feeding it the JSON-encoded container state
+ // on stdin and capturing its stdout and stderr for error reporting.
+ //
+ // It returns an error when the hook path is empty or not absolute, when the
+ // state cannot be marshalled, when the process cannot be started, when
+ // waiting for it reports a failure, or when h.Timeout (in seconds) elapses
+ // first. With a nil h.Timeout no time limit is applied.
+ func executeHook(h specs.Hook, s specs.State) error {
+ 	// killGracePeriod bounds how long to wait for a timed-out hook to be
+ 	// reaped after it has been killed.
+ 	const killGracePeriod = 5 * time.Second
+
+ 	log.Debugf("Executing hook %+v, state: %+v", h, s)
+
+ 	if strings.TrimSpace(h.Path) == "" {
+ 		return fmt.Errorf("empty path for hook")
+ 	}
+ 	if !filepath.IsAbs(h.Path) {
+ 		return fmt.Errorf("path for hook is not absolute: %q", h.Path)
+ 	}
+
+ 	b, err := json.Marshal(s)
+ 	if err != nil {
+ 		return err
+ 	}
+ 	var stdout, stderr bytes.Buffer
+ 	cmd := exec.Cmd{
+ 		Path:   h.Path,
+ 		Args:   h.Args,
+ 		Env:    h.Env,
+ 		Stdin:  bytes.NewReader(b),
+ 		Stdout: &stdout,
+ 		Stderr: &stderr,
+ 	}
+ 	if err := cmd.Start(); err != nil {
+ 		return err
+ 	}
+
+ 	// Wait in a goroutine so the wait can race against the timeout. The
+ 	// channel is buffered so the goroutine never blocks on send.
+ 	c := make(chan error, 1)
+ 	go func() {
+ 		c <- cmd.Wait()
+ 	}()
+
+ 	// A nil channel never becomes ready, so no timeout applies when
+ 	// h.Timeout is unset.
+ 	var timer <-chan time.Time
+ 	if h.Timeout != nil {
+ 		timer = time.After(time.Duration(*h.Timeout) * time.Second)
+ 	}
+ 	select {
+ 	case err := <-c:
+ 		if err != nil {
+ 			return fmt.Errorf("failure executing hook %q, err: %v\nstdout: %s\nstderr: %s", h.Path, err, stdout.String(), stderr.String())
+ 		}
+ 	case <-timer:
+ 		if err := cmd.Process.Kill(); err != nil {
+ 			log.Warningf("Failure to kill hook %q after timeout: %v", h.Path, err)
+ 		}
+ 		// cmd.Wait is already running in the goroutine above and must not be
+ 		// called a second time. Wait for that call to finish so the process
+ 		// is reaped and the output buffers are no longer being written. The
+ 		// wait is bounded because Wait also blocks until the output pipes
+ 		// are closed, which a surviving descendant of the hook could delay.
+ 		select {
+ 		case <-c:
+ 			return fmt.Errorf("timeout executing hook %q\nstdout: %s\nstderr: %s", h.Path, stdout.String(), stderr.String())
+ 		case <-time.After(killGracePeriod):
+ 			// The buffers may still be written to, so they are not read here.
+ 			return fmt.Errorf("timeout executing hook %q, and it did not exit within %v of being killed", h.Path, killGracePeriod)
+ 		}
+ 	}
+
+ 	log.Debugf("Execute hook %q success!", h.Path)
+ 	return nil
+ }
diff --git a/runsc/container/status.go b/runsc/container/status.go
new file mode 100644
index 000000000..91d9112f1
--- /dev/null
+++ b/runsc/container/status.go
@@ -0,0 +1,60 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package container
+
+ // Status enumerates container statuses. The statuses and their semantics are
+ // part of the runtime CLI spec.
+ //
+ // The numeric values follow declaration order (iota), so Created, not
+ // Creating, is the zero value of Status.
+ type Status int
+
+ const (
+ // Created indicates "the runtime has finished the create operation and
+ // the container process has neither exited nor executed the
+ // user-specified program".
+ Created Status = iota
+
+ // Creating indicates "the container is being created".
+ Creating
+
+ // Paused indicates that the process within the container has been
+ // suspended.
+ Paused
+
+ // Running indicates "the container process has executed the
+ // user-specified program but has not exited".
+ Running
+
+ // Stopped indicates "the container process has exited".
+ Stopped
+ )
+
+ // String converts a Status to a string. These strings are part of the runtime
+ // CLI spec and should not be changed. Values that are not one of the declared
+ // statuses are rendered as "unknown".
+ func (s Status) String() string {
+ 	names := map[Status]string{
+ 		Created:  "created",
+ 		Creating: "creating",
+ 		Paused:   "paused",
+ 		Running:  "running",
+ 		Stopped:  "stopped",
+ 	}
+ 	if name, ok := names[s]; ok {
+ 		return name
+ 	}
+ 	return "unknown"
+ }
diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go
new file mode 100644
index 000000000..4faab2946
--- /dev/null
+++ b/runsc/fsgofer/filter/config.go
@@ -0,0 +1,182 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package filter
+
+import (
+ "os"
+ "syscall"
+
+ "golang.org/x/sys/unix"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+)
+
+// allowedSyscalls is the set of syscalls executed by the gofer.
+var allowedSyscalls = seccomp.SyscallRules{
+ syscall.SYS_ACCEPT: {},
+ syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
+ {seccomp.AllowValue(linux.ARCH_GET_FS)},
+ {seccomp.AllowValue(linux.ARCH_SET_FS)},
+ },
+ syscall.SYS_CLOCK_GETTIME: {},
+ syscall.SYS_CLONE: []seccomp.Rule{
+ {
+ seccomp.AllowValue(
+ syscall.CLONE_VM |
+ syscall.CLONE_FS |
+ syscall.CLONE_FILES |
+ syscall.CLONE_SIGHAND |
+ syscall.CLONE_SYSVSEM |
+ syscall.CLONE_THREAD),
+ },
+ },
+ syscall.SYS_CLOSE: {},
+ syscall.SYS_DUP: {},
+ syscall.SYS_EPOLL_CTL: {},
+ syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(0),
+ },
+ },
+ syscall.SYS_EVENTFD2: []seccomp.Rule{
+ {
+ seccomp.AllowValue(0),
+ seccomp.AllowValue(0),
+ },
+ },
+ syscall.SYS_EXIT: {},
+ syscall.SYS_EXIT_GROUP: {},
+ syscall.SYS_FALLOCATE: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(0),
+ },
+ },
+ syscall.SYS_FCHMOD: {},
+ syscall.SYS_FCHOWNAT: {},
+ syscall.SYS_FCNTL: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.F_GETFL),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.F_SETFL),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.F_GETFD),
+ },
+ },
+ syscall.SYS_FSTAT: {},
+ syscall.SYS_FSTATFS: {},
+ syscall.SYS_FSYNC: {},
+ syscall.SYS_FTRUNCATE: {},
+ syscall.SYS_FUTEX: {
+ seccomp.Rule{
+ seccomp.AllowAny{},
+ seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(0),
+ },
+ seccomp.Rule{
+ seccomp.AllowAny{},
+ seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(0),
+ },
+ },
+ syscall.SYS_GETDENTS64: {},
+ syscall.SYS_GETPID: {},
+ unix.SYS_GETRANDOM: {},
+ syscall.SYS_GETTID: {},
+ syscall.SYS_GETTIMEOFDAY: {},
+ syscall.SYS_LINKAT: {},
+ syscall.SYS_LSEEK: {},
+ syscall.SYS_MADVISE: {},
+ syscall.SYS_MKDIRAT: {},
+ syscall.SYS_MMAP: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MAP_SHARED),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
+ },
+ },
+ syscall.SYS_MPROTECT: {},
+ syscall.SYS_MUNMAP: {},
+ syscall.SYS_NANOSLEEP: {},
+ syscall.SYS_NEWFSTATAT: {},
+ syscall.SYS_OPENAT: {},
+ syscall.SYS_POLL: {},
+ syscall.SYS_PREAD64: {},
+ syscall.SYS_PWRITE64: {},
+ syscall.SYS_READ: {},
+ syscall.SYS_READLINKAT: {},
+ syscall.SYS_RECVMSG: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
+ },
+ },
+ syscall.SYS_RENAMEAT: {},
+ syscall.SYS_RESTART_SYSCALL: {},
+ syscall.SYS_RT_SIGPROCMASK: {},
+ syscall.SYS_SCHED_YIELD: {},
+ syscall.SYS_SENDMSG: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
+ },
+ },
+ syscall.SYS_SHUTDOWN: []seccomp.Rule{
+ {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
+ },
+ syscall.SYS_SIGALTSTACK: {},
+ syscall.SYS_SYMLINKAT: {},
+ syscall.SYS_TGKILL: []seccomp.Rule{
+ {
+ seccomp.AllowValue(uint64(os.Getpid())),
+ },
+ },
+ syscall.SYS_UNLINKAT: {},
+ syscall.SYS_UTIMENSAT: {},
+ syscall.SYS_WRITE: {},
+}
diff --git a/runsc/fsgofer/filter/extra_filters.go b/runsc/fsgofer/filter/extra_filters.go
new file mode 100644
index 000000000..5c5ec4e06
--- /dev/null
+++ b/runsc/fsgofer/filter/extra_filters.go
@@ -0,0 +1,28 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !msan,!race
+
+package filter
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+)
+
+ // instrumentationFilters returns additional filters for syscalls used by
+ // Go instrumentation tools, e.g. -race, -msan.
+ // Returns nil in this build, where neither instrumentation is enabled.
+ func instrumentationFilters() seccomp.SyscallRules {
+ return nil
+ }
diff --git a/runsc/fsgofer/filter/extra_filters_msan.go b/runsc/fsgofer/filter/extra_filters_msan.go
new file mode 100644
index 000000000..553060bc3
--- /dev/null
+++ b/runsc/fsgofer/filter/extra_filters_msan.go
@@ -0,0 +1,33 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build msan
+
+package filter
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+)
+
+ // instrumentationFilters returns the extra syscall rules needed when the
+ // binary is built with MSAN, after logging a warning that the filters are
+ // less restrictive than usual.
+ func instrumentationFilters() seccomp.SyscallRules {
+ 	log.Warningf("*** SECCOMP WARNING: MSAN is enabled: syscall filters less restrictive!")
+
+ 	extra := seccomp.SyscallRules{
+ 		syscall.SYS_SCHED_GETAFFINITY: {},
+ 		syscall.SYS_SET_ROBUST_LIST:   {},
+ 	}
+ 	return extra
+ }
diff --git a/runsc/fsgofer/filter/extra_filters_race.go b/runsc/fsgofer/filter/extra_filters_race.go
new file mode 100644
index 000000000..28555f898
--- /dev/null
+++ b/runsc/fsgofer/filter/extra_filters_race.go
@@ -0,0 +1,42 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build race
+
+package filter
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+)
+
+ // instrumentationFilters returns the extra syscall rules needed when the
+ // binary is built with TSAN (-race), after logging a warning that the filters
+ // are less restrictive than usual.
+ func instrumentationFilters() seccomp.SyscallRules {
+ 	log.Warningf("*** SECCOMP WARNING: TSAN is enabled: syscall filters less restrictive!")
+
+ 	extra := seccomp.SyscallRules{
+ 		syscall.SYS_BRK:             {},
+ 		syscall.SYS_CLONE:           {},
+ 		syscall.SYS_FUTEX:           {},
+ 		syscall.SYS_MADVISE:         {},
+ 		syscall.SYS_MMAP:            {},
+ 		syscall.SYS_MUNLOCK:         {},
+ 		syscall.SYS_NANOSLEEP:       {},
+ 		syscall.SYS_OPEN:            {},
+ 		syscall.SYS_SET_ROBUST_LIST: {},
+ 		// Used within glibc's malloc.
+ 		syscall.SYS_TIME: {},
+ 	}
+ 	return extra
+ }
diff --git a/runsc/fsgofer/filter/filter.go b/runsc/fsgofer/filter/filter.go
new file mode 100644
index 000000000..ff8154369
--- /dev/null
+++ b/runsc/fsgofer/filter/filter.go
@@ -0,0 +1,33 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package filter defines all syscalls the gofer is allowed to make, and
+// installs seccomp filters to prevent prohibited syscalls in case it's
+// compromised.
+package filter
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+)
+
+ // Install installs seccomp filters.
+ //
+ // The filter set is allowedSyscalls extended with whatever
+ // instrumentationFilters returns for this build; the result is handed to
+ // seccomp.Install, whose error is returned.
+ func Install() error {
+ // NOTE(review): s is not a copy if seccomp.SyscallRules is a map type (its
+ // literal syntax suggests so); Merge below would then modify the
+ // package-level allowedSyscalls in place -- confirm this is intended.
+ s := allowedSyscalls
+
+ // Set of additional filters used by -race and -msan. Returns empty
+ // when not enabled.
+ s.Merge(instrumentationFilters())
+
+ return seccomp.Install(s)
+ }
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
new file mode 100644
index 000000000..2cf50290a
--- /dev/null
+++ b/runsc/fsgofer/fsgofer.go
@@ -0,0 +1,1057 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fsgofer implements p9.File giving access to local files using
+// a simple mapping from a path prefix that is added to the path requested
+// by the sandbox. Ex:
+//
+// prefix: "/docker/imgs/alpine"
+// app path: /bin/ls => /docker/imgs/alpine/bin/ls
+package fsgofer
+
+import (
+ "fmt"
+ "io"
+ "math"
+ "os"
+ "path"
+ "path/filepath"
+ "runtime"
+ "sync"
+ "syscall"
+
+ "golang.org/x/sys/unix"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/fd"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/p9"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+ const (
+ // invalidMode is set to a value that doesn't match any other valid
+ // modes to ensure an unopened/closed file fails all mode checks.
+ invalidMode = p9.OpenFlags(math.MaxUint32)
+
+ // openFlags are flags combined with the access mode when files are
+ // opened: do not follow a symlink in the final path component
+ // (O_NOFOLLOW) and close the descriptor on exec (O_CLOEXEC).
+ openFlags = syscall.O_NOFOLLOW | syscall.O_CLOEXEC
+ )
+
+ // fileType classifies the kinds of host files handled here.
+ type fileType int
+
+ const (
+ 	// regular is a regular file. It is the zero value of fileType.
+ 	regular fileType = iota
+ 	// directory is a directory.
+ 	directory
+ 	// symlink is a symbolic link.
+ 	symlink
+ 	// unknown is any other kind of file.
+ 	unknown
+ )
+
+ // String implements fmt.Stringer. Any value other than regular, directory or
+ // symlink is rendered as "unknown".
+ func (f fileType) String() string {
+ 	name := "unknown"
+ 	switch f {
+ 	case regular:
+ 		name = "regular"
+ 	case directory:
+ 		name = "directory"
+ 	case symlink:
+ 		name = "symlink"
+ 	}
+ 	return name
+ }
+
+// ControlSocketAddr generates an abstract unix socket name for the given id.
+func ControlSocketAddr(id string) string {
+ return fmt.Sprintf("\x00runsc-gofer.%s", id)
+}
+
+ // Config sets configuration options for each attach point.
+ type Config struct {
+ // ROMount is set to true if this is a readonly mount.
+ ROMount bool
+
+ // PanicOnWrite panics on attempts to write to RO mounts. In Create and
+ // Mkdir it is only consulted when ROMount is set.
+ PanicOnWrite bool
+ }
+
+ // attachPoint is the p9.Attacher returned by NewAttachPoint. It also hands
+ // out the small device numbers used to build virtual inode ids in makeQID.
+ type attachPoint struct {
+ // prefix is the absolute host path that is opened by Attach.
+ prefix string
+ // conf holds the options this attach point was created with.
+ conf Config
+
+ // attachedMu protects attached.
+ attachedMu sync.Mutex
+ // attached is set by Attach; a further Attach call then fails.
+ attached bool
+
+ // deviceMu protects devices and nextDevice.
+ deviceMu sync.Mutex
+
+ // nextDevice is the next device id that will be allocated.
+ nextDevice uint8
+
+ // devices is a map from actual host devices to "small" integers that
+ // can be combined with host inode to form a unique virtual inode id.
+ devices map[uint64]uint8
+ }
+
+ // NewAttachPoint creates a new attacher that gives local file access to all
+ // files under 'prefix'. An error is returned unless 'prefix' is an absolute
+ // path.
+ func NewAttachPoint(prefix string, c Config) (p9.Attacher, error) {
+ 	if filepath.IsAbs(prefix) {
+ 		ap := &attachPoint{
+ 			prefix:  prefix,
+ 			conf:    c,
+ 			devices: make(map[uint64]uint8),
+ 		}
+ 		return ap, nil
+ 	}
+ 	return nil, fmt.Errorf("attach point prefix must be absolute %q", prefix)
+ }
+
+ // Attach implements p9.Attacher.
+ //
+ // It stats and opens a.prefix (read-only for read-only mounts and
+ // directories, read-write otherwise) and wraps it in a localFile. An attach
+ // point can be attached only once; later calls fail. The opened file is
+ // closed on every error path, and the attach point is marked attached only
+ // when a file is actually returned, so a failed attempt does not consume it.
+ func (a *attachPoint) Attach() (p9.File, error) {
+ 	// dirFD (1st argument) is ignored because 'prefix' is always absolute.
+ 	stat, err := statAt(-1, a.prefix)
+ 	if err != nil {
+ 		return nil, fmt.Errorf("stat file %q, err: %v", a.prefix, err)
+ 	}
+ 	mode := syscall.O_RDWR
+ 	// NOTE(review): this tests the S_IFDIR bit rather than comparing
+ 	// stat.Mode&S_IFMT with S_IFDIR, so block devices and sockets, whose type
+ 	// bits include it, are also opened read-only. newLocalFile rejects those
+ 	// types below.
+ 	if a.conf.ROMount || stat.Mode&syscall.S_IFDIR != 0 {
+ 		mode = syscall.O_RDONLY
+ 	}
+
+ 	// Open the root directory.
+ 	f, err := fd.Open(a.prefix, openFlags|mode, 0)
+ 	if err != nil {
+ 		return nil, fmt.Errorf("unable to open file %q, err: %v", a.prefix, err)
+ 	}
+
+ 	a.attachedMu.Lock()
+ 	defer a.attachedMu.Unlock()
+ 	if a.attached {
+ 		f.Close()
+ 		return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix)
+ 	}
+
+ 	lf, err := newLocalFile(a, f, a.prefix, stat)
+ 	if err != nil {
+ 		// Nothing took ownership of f, so release it. A literal nil is
+ 		// returned rather than a nil *localFile wrapped in a p9.File.
+ 		f.Close()
+ 		return nil, err
+ 	}
+ 	a.attached = true
+ 	return lf, nil
+ }
+
+ // makeQID returns a unique QID for the given stat buffer.
+ //
+ // The QID path is a "virtual" inode id: a small per-attach-point device
+ // number in the top 8 bits and the host inode number in the lower 56 bits.
+ // Device numbers are handed out sequentially the first time each host device
+ // is seen. The function panics when the 8-bit device counter wraps around.
+ // The QID type is derived from stat.Mode.
+ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
+ 	a.deviceMu.Lock()
+ 	defer a.deviceMu.Unlock()
+
+ 	// First map the host device id to a unique 8-bit integer.
+ 	dev, ok := a.devices[stat.Dev]
+ 	if !ok {
+ 		a.devices[stat.Dev] = a.nextDevice
+ 		dev = a.nextDevice
+ 		a.nextDevice++
+ 		// uint8 arithmetic wraps, so a smaller value means the counter
+ 		// overflowed.
+ 		if a.nextDevice < dev {
+ 			panic(fmt.Sprintf("device id overflow! map: %+v", a.devices))
+ 		}
+ 	}
+
+ 	// Construct a "virtual" inode id with the uint8 device number in the
+ 	// top 8 bits, and the rest of the bits from the host inode id. The mask
+ 	// clears exactly those top 8 bits of the host inode id.
+ 	maskedIno := stat.Ino & 0x00ffffffffffffff
+ 	if maskedIno != stat.Ino {
+ 		log.Warningf("first 8 bits of host inode id %x will be truncated to construct virtual inode id", stat.Ino)
+ 	}
+ 	ino := uint64(dev)<<56 | maskedIno
+ 	log.Debugf("host inode %x on device %x mapped to virtual inode %x", stat.Ino, stat.Dev, ino)
+
+ 	return p9.QID{
+ 		Type: p9.FileMode(stat.Mode).QIDType(),
+ 		Path: ino,
+ 	}
+ }
+
+ // localFile implements p9.File wrapping a local file. The underlying file
+ // is opened during Walk() and stored in 'file' to be used with other
+ // operations. The file is opened as readonly, unless it's a symlink or there is
+ // no read access, which requires O_PATH. 'file' is dup'ed when Walk(nil) is
+ // called to clone the file. This reduces the number of walks that need to be
+ // done by the host file system when files are reused.
+ //
+ // The file may be reopened if the requested mode in Open() is not a subset of
+ // current mode. Consequently, 'file' could have a mode wider than requested and
+ // must be verified before read/write operations. Before the file is opened and
+ // after it's closed, 'mode' is set to an invalid value to prevent an unopened
+ // file from being used.
+ //
+ // The reason that the file is not opened initially as read-write is for better
+ // performance with 'overlay2' storage driver. overlay2 eagerly copies the
+ // entire file up when it's opened in write mode, so opening every file
+ // read-write up front would perform badly.
+ type localFile struct {
+ p9.DefaultWalkGetAttr
+
+ // attachPoint is the attachPoint that serves this localFile.
+ attachPoint *attachPoint
+
+ // hostPath will be safely updated by the Renamed hook.
+ hostPath string
+
+ // file is opened when localFile is created and it's never nil. It may be
+ // reopened if the Open() mode is wider than the mode the file was originally
+ // opened with.
+ file *fd.FD
+
+ // mode is the mode in which the file was opened. Set to invalidMode
+ // if localFile isn't opened.
+ mode p9.OpenFlags
+
+ // ft is the fileType for this file.
+ ft fileType
+
+ // readDirMu protects against concurrent Readdir calls.
+ readDirMu sync.Mutex
+
+ // lastDirentOffset is the last offset returned by Readdir(). If another call
+ // to Readdir is made at the same offset, the file doesn't need to be
+ // repositioned. This is an important optimization because the caller must
+ // always make one extra call to detect EOF (empty result, no error).
+ lastDirentOffset uint64
+ }
+
+ // openAnyFileFromParent opens 'name' relative to parent's file using
+ // openAnyFile. It returns the opened file together with the child's host
+ // path, built by joining parent.hostPath and name.
+ func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, error) {
+ 	childPath := path.Join(parent.hostPath, name)
+ 	opener := func(mode int) (*fd.FD, error) {
+ 		return fd.OpenAt(parent.file, name, openFlags|mode, 0)
+ 	}
+ 	file, err := openAnyFile(childPath, opener)
+ 	return file, childPath, err
+ }
+
+ // openAnyFile attempts to open the file in O_RDONLY and if it fails falls back
+ // to O_PATH. 'path' is used for logging messages only. 'fn' is what does the
+ // actual file open and is customizable by the caller. Errors are returned as
+ // the errno extracted from the failed open. ENOENT ends the attempts at once.
+ func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error) {
+ 	// Modes are tried in this order:
+ 	//   1. RDONLY | NONBLOCK: for all files, works for directories and ro
+ 	//      mounts too. Non-blocking prevents getting stuck inside open(2) for
+ 	//      FIFOs and has no effect on regular files.
+ 	//   2. PATH: for symlinks.
+ 	attempts := [...]int{
+ 		syscall.O_RDONLY | syscall.O_NONBLOCK,
+ 		unix.O_PATH,
+ 	}
+
+ 	var lastErr error
+ 	for n, m := range attempts {
+ 		file, err := fn(m)
+ 		if err == nil {
+ 			// openat succeeded, we're done.
+ 			return file, nil
+ 		}
+ 		if errno := extractErrno(err); errno == syscall.ENOENT {
+ 			// File doesn't exist, no point in retrying.
+ 			return nil, errno
+ 		}
+ 		// Remember the failure in case this was the last attempt, then move
+ 		// on to the next mode.
+ 		lastErr = err
+ 		log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", n, openFlags|m, path, err)
+ 	}
+
+ 	// All attempts to open file have failed, return the last error.
+ 	log.Debugf("Failed to open file, path: %q, err: %v", path, lastErr)
+ 	return nil, extractErrno(lastErr)
+ }
+
+func getSupportedFileType(stat syscall.Stat_t) (fileType, error) {
+ var ft fileType
+ switch stat.Mode & syscall.S_IFMT {
+ case syscall.S_IFREG:
+ ft = regular
+ case syscall.S_IFDIR:
+ ft = directory
+ case syscall.S_IFLNK:
+ ft = symlink
+ default:
+ return unknown, syscall.EPERM
+ }
+ return ft, nil
+}
+
+ // newLocalFile wraps an already opened file in a localFile served by a. The
+ // file type is taken from stat and the call fails for unsupported types. The
+ // new localFile starts out unopened (mode is invalidMode).
+ func newLocalFile(a *attachPoint, file *fd.FD, path string, stat syscall.Stat_t) (*localFile, error) {
+ 	kind, err := getSupportedFileType(stat)
+ 	if err != nil {
+ 		return nil, err
+ 	}
+
+ 	lf := &localFile{
+ 		attachPoint: a,
+ 		hostPath:    path,
+ 		file:        file,
+ 		mode:        invalidMode,
+ 		ft:          kind,
+ 	}
+ 	return lf, nil
+ }
+
+ // newFDMaybe duplicates the descriptor behind 'file', switches the duplicate
+ // to non-blocking mode and returns it wrapped in a new fd.FD. Any failure
+ // yields nil: it's better to have a file without host FD than to fail the
+ // operation.
+ func newFDMaybe(file *fd.FD) *fd.FD {
+ 	rawDup, dupErr := syscall.Dup(file.FD())
+ 	// Technically, the runtime may call the finalizer on file as soon as
+ 	// FD() returns.
+ 	runtime.KeepAlive(file)
+ 	if dupErr != nil {
+ 		return nil
+ 	}
+
+ 	// The duplicate is blocking; non-blocking is required.
+ 	wrapped := fd.New(rawDup)
+ 	if err := syscall.SetNonblock(wrapped.FD(), true); err != nil {
+ 		wrapped.Close()
+ 		return nil
+ 	}
+ 	return wrapped
+ }
+
+ // stat returns the result of fstat(2) on fd. On failure it returns a zero
+ // Stat_t together with the error.
+ func stat(fd int) (syscall.Stat_t, error) {
+ 	var buf syscall.Stat_t
+ 	err := syscall.Fstat(fd, &buf)
+ 	if err != nil {
+ 		return syscall.Stat_t{}, err
+ 	}
+ 	return buf, nil
+ }
+
+ // fchown changes the owner and group of the file referred to by fd. It calls
+ // Fchownat with an empty path and AT_EMPTY_PATH so that the change applies to
+ // fd itself, and also passes AT_SYMLINK_NOFOLLOW.
+ func fchown(fd int, uid p9.UID, gid p9.GID) error {
+ return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW)
+ }
+
+ // Open implements p9.File.
+ //
+ // A read-only open reuses the control file. Any other mode reopens the file
+ // by host path and, once the new file has been stat'ed, replaces the control
+ // file with it. A duplicated host FD is returned for regular files only (and
+ // may be nil if duplication fails). Opening an already opened file panics.
+ func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
+ 	if l.isOpen() {
+ 		panic(fmt.Sprintf("attempting to open already opened file: %q", l.hostPath))
+ 	}
+
+ 	// Check if control file can be used or if a new open must be created.
+ 	target := l.file
+ 	if mode == p9.ReadOnly {
+ 		log.Debugf("Open reusing control file, mode: %v, %q", mode, l.hostPath)
+ 	} else {
+ 		// Ideally reopen would call name_to_handle_at (with empty name) and
+ 		// open_by_handle_at to reopen the file without using 'hostPath'.
+ 		// However, name_to_handle_at and open_by_handle_at aren't supported
+ 		// by overlay2.
+ 		log.Debugf("Open reopening file, mode: %v, %q", mode, l.hostPath)
+ 		reopened, err := fd.Open(l.hostPath, openFlags|mode.OSFlags(), 0)
+ 		if err != nil {
+ 			return nil, p9.QID{}, 0, extractErrno(err)
+ 		}
+ 		target = reopened
+ 	}
+ 	replaced := target != l.file
+
+ 	st, err := stat(target.FD())
+ 	if err != nil {
+ 		if replaced {
+ 			target.Close()
+ 		}
+ 		return nil, p9.QID{}, 0, extractErrno(err)
+ 	}
+
+ 	// Donate FD for regular files only.
+ 	var hostFD *fd.FD
+ 	if st.Mode&syscall.S_IFMT == syscall.S_IFREG {
+ 		hostFD = newFDMaybe(target)
+ 	}
+
+ 	// Close old file in case a new one was created.
+ 	if replaced {
+ 		if err := l.file.Close(); err != nil {
+ 			log.Warningf("Error closing file %q: %v", l.hostPath, err)
+ 		}
+ 		l.file = target
+ 	}
+ 	l.mode = mode
+ 	return hostFD, l.attachPoint.makeQID(st), 0, nil
+ }
+
+ // Create implements p9.File.
+ //
+ // It creates 'name' exclusively under this directory, changes its ownership
+ // to uid/gid and returns a localFile for it that is already open in 'mode',
+ // along with a duplicated host FD (possibly nil). If a step after creation
+ // fails, the new file is closed and a best-effort unlink is attempted. On
+ // read-only mounts it fails with EBADF, or panics if PanicOnWrite is set.
+ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) {
+ 	if conf := l.attachPoint.conf; conf.ROMount {
+ 		if conf.PanicOnWrite {
+ 			panic("attempt to write to RO mount")
+ 		}
+ 		return nil, nil, p9.QID{}, 0, syscall.EBADF
+ 	}
+
+ 	// 'file' may be used for other operations (e.g. Walk), so read access is
+ 	// always added to flags. Note that resulting file might have a wider mode
+ 	// than needed for each particular case.
+ 	osFlags := openFlags | syscall.O_CREAT | syscall.O_EXCL
+ 	switch mode {
+ 	case p9.WriteOnly:
+ 		osFlags |= syscall.O_RDWR
+ 	default:
+ 		osFlags |= mode.OSFlags()
+ 	}
+
+ 	child, err := fd.OpenAt(l.file, name, osFlags, uint32(perm.Permissions()))
+ 	if err != nil {
+ 		return nil, nil, p9.QID{}, 0, extractErrno(err)
+ 	}
+ 	cleanup := specutils.MakeCleanup(func() {
+ 		child.Close()
+ 		// Best effort attempt to remove the file in case of failure.
+ 		if err := syscall.Unlinkat(l.file.FD(), name); err != nil {
+ 			log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err)
+ 		}
+ 	})
+ 	defer cleanup.Clean()
+
+ 	if err := fchown(child.FD(), uid, gid); err != nil {
+ 		return nil, nil, p9.QID{}, 0, extractErrno(err)
+ 	}
+ 	st, err := stat(child.FD())
+ 	if err != nil {
+ 		return nil, nil, p9.QID{}, 0, extractErrno(err)
+ 	}
+
+ 	// ft is left at its zero value, which is regular.
+ 	created := &localFile{
+ 		attachPoint: l.attachPoint,
+ 		hostPath:    path.Join(l.hostPath, name),
+ 		file:        child,
+ 		mode:        mode,
+ 	}
+
+ 	// Success: the new file must survive, so cancel the cleanup.
+ 	cleanup.Release()
+ 	return newFDMaybe(created.file), created, l.attachPoint.makeQID(st), 0, nil
+ }
+
+ // Mkdir implements p9.File.
+ //
+ // The directory is created relative to this file, then opened so that its
+ // ownership can be set and its QID computed. If a step after the creation
+ // fails, the directory is removed on a best-effort basis.
+ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) {
+ 	if conf := l.attachPoint.conf; conf.ROMount {
+ 		if conf.PanicOnWrite {
+ 			panic("attempt to write to RO mount")
+ 		}
+ 		return p9.QID{}, syscall.EBADF
+ 	}
+
+ 	err := syscall.Mkdirat(l.file.FD(), name, uint32(perm.Permissions()))
+ 	if err != nil {
+ 		return p9.QID{}, extractErrno(err)
+ 	}
+ 	cleanup := specutils.MakeCleanup(func() {
+ 		// Best effort attempt to remove the dir in case of failure.
+ 		if uerr := unix.Unlinkat(l.file.FD(), name, unix.AT_REMOVEDIR); uerr != nil {
+ 			log.Warningf("error unlinking dir %q after failure: %v", path.Join(l.hostPath, name), uerr)
+ 		}
+ 	})
+ 	defer cleanup.Clean()
+
+ 	// Open directory to change ownership and stat it.
+ 	dir, err := fd.OpenAt(l.file, name, syscall.O_DIRECTORY|syscall.O_RDONLY|openFlags, 0)
+ 	if err != nil {
+ 		return p9.QID{}, extractErrno(err)
+ 	}
+ 	defer dir.Close()
+
+ 	if err := fchown(dir.FD(), uid, gid); err != nil {
+ 		return p9.QID{}, extractErrno(err)
+ 	}
+ 	st, err := stat(dir.FD())
+ 	if err != nil {
+ 		return p9.QID{}, extractErrno(err)
+ 	}
+
+ 	// Success: keep the directory.
+ 	cleanup.Release()
+ 	return l.attachPoint.makeQID(st), nil
+ }
+
+ // Walk implements p9.File.
+ //
+ // With no names, it re-opens 'hostPath' and returns a new localFile for the
+ // same path, in the not-open state (mode is invalidMode). Otherwise it opens
+ // each name in turn relative to the previously walked file and returns one QID
+ // per component plus the localFile for the last component. On any failure no
+ // QIDs and no file are returned.
+ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) {
+ // Duplicate current file if 'names' is empty.
+ if len(names) == 0 {
+ newFile, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) {
+ return fd.Open(l.hostPath, openFlags|mode, 0)
+ })
+ if err != nil {
+ return nil, nil, extractErrno(err)
+ }
+
+ stat, err := stat(newFile.FD())
+ if err != nil {
+ newFile.Close()
+ return nil, nil, extractErrno(err)
+ }
+
+ c := &localFile{
+ attachPoint: l.attachPoint,
+ hostPath: l.hostPath,
+ file: newFile,
+ mode: invalidMode,
+ }
+ return []p9.QID{l.attachPoint.makeQID(stat)}, c, nil
+ }
+
+ var qids []p9.QID
+ last := l
+ for _, name := range names {
+ f, path, err := openAnyFileFromParent(last, name)
+ // 'last' was only needed to open its child. Intermediate files created by
+ // this walk are closed here, whether or not the open succeeded; the
+ // receiver itself is never closed.
+ if last != l {
+ last.Close()
+ }
+ if err != nil {
+ return nil, nil, extractErrno(err)
+ }
+ stat, err := stat(f.FD())
+ if err != nil {
+ f.Close()
+ return nil, nil, extractErrno(err)
+ }
+ c, err := newLocalFile(last.attachPoint, f, path, stat)
+ if err != nil {
+ f.Close()
+ return nil, nil, extractErrno(err)
+ }
+
+ qids = append(qids, l.attachPoint.makeQID(stat))
+ last = c
+ }
+ return qids, last, nil
+ }
+
+ // StatFS implements p9.File.
+ //
+ // It reports the fstatfs(2) result for the underlying host file.
+ func (l *localFile) StatFS() (p9.FSStat, error) {
+ 	var hostStat syscall.Statfs_t
+ 	err := syscall.Fstatfs(l.file.FD(), &hostStat)
+ 	if err != nil {
+ 		return p9.FSStat{}, extractErrno(err)
+ 	}
+
+ 	// Only the fields set below are populated; the rest keep their zero value.
+ 	var out p9.FSStat
+ 	out.Type = uint32(hostStat.Type)
+ 	out.BlockSize = uint32(hostStat.Bsize)
+ 	out.Blocks = hostStat.Blocks
+ 	out.BlocksFree = hostStat.Bfree
+ 	out.BlocksAvailable = hostStat.Bavail
+ 	out.Files = hostStat.Files
+ 	out.FilesFree = hostStat.Ffree
+ 	out.NameLength = uint32(hostStat.Namelen)
+ 	return out, nil
+ }
+
+ // FSync implements p9.File.
+ //
+ // It returns EBADF if the file has not been opened.
+ func (l *localFile) FSync() error {
+ 	if l.isOpen() {
+ 		if err := syscall.Fsync(l.file.FD()); err != nil {
+ 			return extractErrno(err)
+ 		}
+ 		return nil
+ 	}
+ 	return syscall.EBADF
+ }
+
+// GetAttr implements p9.File.
+func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) {
+ stat, err := stat(l.file.FD())
+ if err != nil {
+ return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err)
+ }
+
+ attr := p9.Attr{
+ Mode: p9.FileMode(stat.Mode),
+ UID: p9.UID(stat.Uid),
+ GID: p9.GID(stat.Gid),
+ NLink: stat.Nlink,
+ RDev: stat.Rdev,
+ Size: uint64(stat.Size),
+ BlockSize: uint64(stat.Blksize),
+ Blocks: uint64(stat.Blocks),
+ ATimeSeconds: uint64(stat.Atim.Sec),
+ ATimeNanoSeconds: uint64(stat.Atim.Nsec),
+ MTimeSeconds: uint64(stat.Mtim.Sec),
+ MTimeNanoSeconds: uint64(stat.Mtim.Nsec),
+ CTimeSeconds: uint64(stat.Ctim.Sec),
+ CTimeNanoSeconds: uint64(stat.Ctim.Nsec),
+ }
+ valid := p9.AttrMask{
+ Mode: true,
+ UID: true,
+ GID: true,
+ NLink: true,
+ RDev: true,
+ Size: true,
+ Blocks: true,
+ ATime: true,
+ MTime: true,
+ CTime: true,
+ }
+
+ return l.attachPoint.makeQID(stat), valid, attr, nil
+}
+
+ // SetAttr implements p9.File. Due to mismatch in file API, options
+ // cannot be changed atomically and user may see partial changes when
+ // an error happens.
+ //
+ // Every requested change is attempted in turn (permissions, size, times,
+ // ownership). If several of them fail, the errno of the last failure is
+ // returned. The one exception is a failure to open the parent directory of a
+ // symlink when setting times, which returns immediately.
+ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
+ 	conf := l.attachPoint.conf
+ 	if conf.ROMount {
+ 		if conf.PanicOnWrite {
+ 			panic("attempt to write to RO mount")
+ 		}
+ 		return syscall.EBADF
+ 	}
+
+ 	allowed := p9.SetAttrMask{
+ 		Permissions: true,
+ 		UID: true,
+ 		GID: true,
+ 		Size: true,
+ 		ATime: true,
+ 		MTime: true,
+ 		ATimeNotSystemTime: true,
+ 		MTimeNotSystemTime: true,
+ 	}
+
+ 	if valid.Empty() {
+ 		// Nothing to do.
+ 		return nil
+ 	}
+
+ 	// Handle all the sanity checks up front so that the client gets a
+ 	// consistent result that is not attribute dependent.
+ 	if !valid.IsSubsetOf(allowed) {
+ 		log.Warningf("SetAttr() failed for %q, mask: %v", l.hostPath, valid)
+ 		return syscall.EPERM
+ 	}
+
+ 	// Check if it's possible to use cached file, or if another one needs to be
+ 	// opened for write.
+ 	f := l.file
+ 	if l.ft == regular && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite {
+ 		var err error
+ 		f, err = fd.Open(l.hostPath, openFlags|syscall.O_WRONLY, 0)
+ 		if err != nil {
+ 			return extractErrno(err)
+ 		}
+ 		defer f.Close()
+ 	}
+
+ 	// The semantics are to either return an error if no changes were made,
+ 	// or no error if *all* changes were made. Well, this can be impossible
+ 	// if the filesystem rejects at least one of the changes, especially
+ 	// since some operations are not easy to undo atomically.
+ 	//
+ 	// This could be made better if SetAttr actually returned the changes
+ 	// it did make, so the client can at least know what has changed. So
+ 	// we at least attempt to make all of the changes and return a generic
+ 	// error if any of them fails, which at least doesn't bias any change
+ 	// over another.
+ 	var err error
+ 	if valid.Permissions {
+ 		if cerr := syscall.Fchmod(f.FD(), uint32(attr.Permissions)); cerr != nil {
+ 			log.Debugf("SetAttr fchmod failed %q, err: %v", l.hostPath, cerr)
+ 			err = extractErrno(cerr)
+ 		}
+ 	}
+
+ 	if valid.Size {
+ 		if terr := syscall.Ftruncate(f.FD(), int64(attr.Size)); terr != nil {
+ 			log.Debugf("SetAttr ftruncate failed %q, err: %v", l.hostPath, terr)
+ 			err = extractErrno(terr)
+ 		}
+ 	}
+
+ 	if valid.ATime || valid.MTime {
+ 		// Start with both timestamps omitted and fill in only what was requested.
+ 		utimes := [2]syscall.Timespec{
+ 			{Sec: 0, Nsec: linux.UTIME_OMIT},
+ 			{Sec: 0, Nsec: linux.UTIME_OMIT},
+ 		}
+ 		if valid.ATime {
+ 			if valid.ATimeNotSystemTime {
+ 				utimes[0].Sec = int64(attr.ATimeSeconds)
+ 				utimes[0].Nsec = int64(attr.ATimeNanoSeconds)
+ 			} else {
+ 				utimes[0].Nsec = linux.UTIME_NOW
+ 			}
+ 		}
+ 		if valid.MTime {
+ 			if valid.MTimeNotSystemTime {
+ 				utimes[1].Sec = int64(attr.MTimeSeconds)
+ 				utimes[1].Nsec = int64(attr.MTimeNanoSeconds)
+ 			} else {
+ 				utimes[1].Nsec = linux.UTIME_NOW
+ 			}
+ 		}
+
+ 		if l.ft == symlink {
+ 			// utimensat operates differently than other syscalls. To operate on a
+ 			// symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty
+ 			// name.
+ 			//
+ 			// The open error gets its own name on purpose: declaring 'err' here
+ 			// with ':=' would shadow the function-level 'err', and the utimensat
+ 			// failure recorded below would be lost.
+ 			parent, perr := syscall.Open(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0)
+ 			if perr != nil {
+ 				return extractErrno(perr)
+ 			}
+ 			defer syscall.Close(parent)
+
+ 			if terr := utimensat(parent, path.Base(l.hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); terr != nil {
+ 				log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr)
+ 				err = extractErrno(terr)
+ 			}
+ 		} else {
+ 			// Directories and regular files can operate directly on the fd
+ 			// using empty name.
+ 			if terr := utimensat(f.FD(), "", utimes, 0); terr != nil {
+ 				log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr)
+ 				err = extractErrno(terr)
+ 			}
+ 		}
+ 	}
+
+ 	if valid.UID || valid.GID {
+ 		// -1 leaves the corresponding ID unchanged.
+ 		uid := -1
+ 		if valid.UID {
+ 			uid = int(attr.UID)
+ 		}
+ 		gid := -1
+ 		if valid.GID {
+ 			gid = int(attr.GID)
+ 		}
+ 		if oerr := syscall.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil {
+ 			log.Debugf("SetAttr fchownat failed %q, err: %v", l.hostPath, oerr)
+ 			err = extractErrno(oerr)
+ 		}
+ 	}
+
+ 	return err
+ }
+
+ // Allocate implements p9.File.
+ //
+ // It returns EBADF if the file has not been opened; otherwise it forwards the
+ // request to fallocate(2) on the host file.
+ func (l *localFile) Allocate(mode p9.AllocateMode, offset, length uint64) error {
+ 	if !l.isOpen() {
+ 		return syscall.EBADF
+ 	}
+
+ 	err := syscall.Fallocate(l.file.FD(), mode.ToLinux(), int64(offset), int64(length))
+ 	if err != nil {
+ 		return extractErrno(err)
+ 	}
+ 	return nil
+ }
+
+ // Rename implements p9.File. It must never be invoked on a localFile and
+ // panics unconditionally if it is.
+ func (*localFile) Rename(_ p9.File, _ string) error {
+ 	panic("rename called directly")
+ }
+
+ // RenameAt implements p9.File.RenameAt.
+ //
+ // 'oldName' is resolved relative to this directory and 'newName' relative to
+ // 'directory', which must be a *localFile.
+ func (l *localFile) RenameAt(oldName string, directory p9.File, newName string) error {
+ 	if conf := l.attachPoint.conf; conf.ROMount {
+ 		if conf.PanicOnWrite {
+ 			panic("attempt to write to RO mount")
+ 		}
+ 		return syscall.EBADF
+ 	}
+
+ 	dst := directory.(*localFile)
+ 	err := renameat(l.file.FD(), oldName, dst.file.FD(), newName)
+ 	if err != nil {
+ 		return extractErrno(err)
+ 	}
+ 	return nil
+ }
+
+ // ReadAt implements p9.File.
+ //
+ // It returns EBADF unless the file is open with a readable mode. Reaching end
+ // of file is not reported as an error.
+ func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) {
+ 	readable := l.mode == p9.ReadOnly || l.mode == p9.ReadWrite
+ 	if !readable || !l.isOpen() {
+ 		return 0, syscall.EBADF
+ 	}
+
+ 	n, err := l.file.ReadAt(p, int64(offset))
+ 	if err == nil || err == io.EOF {
+ 		return n, nil
+ 	}
+ 	return n, extractErrno(err)
+ }
+
+ // WriteAt implements p9.File.
+ //
+ // It returns EBADF unless the file is open with a writable mode.
+ func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) {
+ 	writable := l.mode == p9.WriteOnly || l.mode == p9.ReadWrite
+ 	if !writable || !l.isOpen() {
+ 		return 0, syscall.EBADF
+ 	}
+
+ 	n, err := l.file.WriteAt(p, int64(offset))
+ 	if err == nil {
+ 		return n, nil
+ 	}
+ 	return n, extractErrno(err)
+ }
+
+ // Symlink implements p9.File.
+ //
+ // The link is created relative to this directory, then opened with O_PATH so
+ // that its ownership can be set and its QID computed. If a step after the
+ // creation fails, the link is removed on a best-effort basis.
+ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) {
+ 	if conf := l.attachPoint.conf; conf.ROMount {
+ 		if conf.PanicOnWrite {
+ 			panic("attempt to write to RO mount")
+ 		}
+ 		return p9.QID{}, syscall.EBADF
+ 	}
+
+ 	err := unix.Symlinkat(target, l.file.FD(), newName)
+ 	if err != nil {
+ 		return p9.QID{}, extractErrno(err)
+ 	}
+ 	cleanup := specutils.MakeCleanup(func() {
+ 		// Best effort attempt to remove the symlink in case of failure.
+ 		if uerr := syscall.Unlinkat(l.file.FD(), newName); uerr != nil {
+ 			log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, newName), uerr)
+ 		}
+ 	})
+ 	defer cleanup.Clean()
+
+ 	// Open symlink to change ownership and stat it.
+ 	link, err := fd.OpenAt(l.file, newName, unix.O_PATH|openFlags, 0)
+ 	if err != nil {
+ 		return p9.QID{}, extractErrno(err)
+ 	}
+ 	defer link.Close()
+
+ 	if err := fchown(link.FD(), uid, gid); err != nil {
+ 		return p9.QID{}, extractErrno(err)
+ 	}
+ 	st, err := stat(link.FD())
+ 	if err != nil {
+ 		return p9.QID{}, extractErrno(err)
+ 	}
+
+ 	// Success: keep the symlink.
+ 	cleanup.Release()
+ 	return l.attachPoint.makeQID(st), nil
+ }
+
+ // Link implements p9.File.
+ //
+ // It creates 'newName' in this directory as a hard link to 'target', which
+ // must be a *localFile.
+ func (l *localFile) Link(target p9.File, newName string) error {
+ 	if conf := l.attachPoint.conf; conf.ROMount {
+ 		if conf.PanicOnWrite {
+ 			panic("attempt to write to RO mount")
+ 		}
+ 		return syscall.EBADF
+ 	}
+
+ 	src := target.(*localFile)
+ 	// The empty path with AT_EMPTY_PATH links the file referred to by the
+ 	// source descriptor itself.
+ 	err := unix.Linkat(src.file.FD(), "", l.file.FD(), newName, linux.AT_EMPTY_PATH)
+ 	if err != nil {
+ 		return extractErrno(err)
+ 	}
+ 	return nil
+ }
+
+// Mknod implements p9.File.
+//
+// Not implemented.
+func (*localFile) Mknod(_ string, _ p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) {
+ // From mknod(2) man page:
+ // "EPERM: [...] if the filesystem containing pathname does not support
+ // the type of node requested."
+ return p9.QID{}, syscall.EPERM
+}
+
+ // UnlinkAt implements p9.File.
+ //
+ // 'name' is resolved relative to this directory and 'flags' is passed to
+ // unlinkat(2) unchanged.
+ func (l *localFile) UnlinkAt(name string, flags uint32) error {
+ 	if conf := l.attachPoint.conf; conf.ROMount {
+ 		if conf.PanicOnWrite {
+ 			panic("attempt to write to RO mount")
+ 		}
+ 		return syscall.EBADF
+ 	}
+
+ 	err := unix.Unlinkat(l.file.FD(), name, int(flags))
+ 	if err != nil {
+ 		return extractErrno(err)
+ 	}
+ 	return nil
+ }
+
+// Readdir implements p9.File.
+func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) {
+ if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite {
+ return nil, syscall.EBADF
+ }
+ if !l.isOpen() {
+ return nil, syscall.EBADF
+ }
+
+ // Readdirnames is a cursor over directories, so seek back to 0 to ensure it's
+ // reading all directory contents. Take a lock because this operation is
+ // stateful.
+ l.readDirMu.Lock()
+ defer l.readDirMu.Unlock()
+
+ skip := uint64(0)
+
+ // Check if the file is at the correct position already. If not, seek to the
+ // beginning and read the entire directory again.
+ if l.lastDirentOffset != offset {
+ if _, err := syscall.Seek(l.file.FD(), 0, 0); err != nil {
+ return nil, extractErrno(err)
+ }
+ skip = offset
+ }
+
+ dirents, err := l.readDirent(l.file.FD(), offset, count, skip)
+ if err == nil {
+ // On success, remember the offset that was returned at the current
+ // position.
+ l.lastDirentOffset = offset + uint64(len(dirents))
+ } else {
+ // On failure, the state is unknown, force call to seek() next time.
+ l.lastDirentOffset = math.MaxUint64
+ }
+ return dirents, err
+}
+
+ // readDirent reads directory entries from 'f', drops the first 'skip' of
+ // them, and returns the rest as p9.Dirents numbered from 'offset'+1.
+ //
+ // 'count' is capped at maxCount and decides when to stop reading further
+ // batches from the host. A whole batch is always consumed, so the result may
+ // hold more than 'count' entries. Entries that cannot be stat'ed are logged
+ // and left out. On a read error the entries gathered so far are returned
+ // together with the error.
+ func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64) ([]p9.Dirent, error) {
+ 	// Limit 'count' to cap the slice size that is returned.
+ 	const maxCount = 100000
+ 	if count > maxCount {
+ 		count = maxCount
+ 	}
+
+ 	dirents := make([]p9.Dirent, 0, count)
+
+ 	// Pre-allocate buffers that will be reused to get partial results.
+ 	direntsBuf := make([]byte, 8192)
+ 	names := make([]string, 0, 100)
+
+ 	end := offset + uint64(count)
+ 	for offset < end {
+ 		dirSize, err := syscall.ReadDirent(f, direntsBuf)
+ 		if err != nil {
+ 			return dirents, err
+ 		}
+ 		if dirSize <= 0 {
+ 			// End of directory.
+ 			return dirents, nil
+ 		}
+
+ 		// Assign back to 'names' (rather than shadowing it) so that a backing
+ 		// array grown by ParseDirent is reused by the next batch.
+ 		_, _, names = syscall.ParseDirent(direntsBuf[:dirSize], -1, names[:0])
+
+ 		// Skip over entries that the caller is not interested in.
+ 		batch := names
+ 		if skip > 0 {
+ 			if skip > uint64(len(batch)) {
+ 				skip -= uint64(len(batch))
+ 				batch = batch[:0]
+ 			} else {
+ 				batch = batch[skip:]
+ 				skip = 0
+ 			}
+ 		}
+ 		for _, name := range batch {
+ 			// Stat relative to the same descriptor the names were read from.
+ 			st, err := statAt(f, name)
+ 			if err != nil {
+ 				// Report the entry that failed, not just the directory.
+ 				log.Warningf("Readdir is skipping file with failed stat %q, err: %v", path.Join(l.hostPath, name), err)
+ 				continue
+ 			}
+ 			qid := l.attachPoint.makeQID(st)
+ 			offset++
+ 			dirents = append(dirents, p9.Dirent{
+ 				QID: qid,
+ 				Type: qid.Type,
+ 				Name: name,
+ 				Offset: offset,
+ 			})
+ 		}
+ 	}
+ 	return dirents, nil
+ }
+
+ // Readlink implements p9.File.
+ //
+ // The link target is read into a buffer that starts at 128 bytes and doubles
+ // until the target fits. ENOMEM is returned once the buffer size would reach
+ // the 1 MiB limit.
+ func (l *localFile) Readlink() (string, error) {
+ 	// Shamelessly stolen from os.Readlink (added upper bound limit to buffer).
+ 	const limit = 1024 * 1024
+ 	size := 128
+ 	for size < limit {
+ 		buf := make([]byte, size)
+ 		n, err := unix.Readlinkat(l.file.FD(), "", buf)
+ 		if err != nil {
+ 			return "", extractErrno(err)
+ 		}
+ 		// A completely filled buffer may mean the target was truncated; only
+ 		// a shorter result is known to be complete.
+ 		if n < size {
+ 			return string(buf[:n]), nil
+ 		}
+ 		size *= 2
+ 	}
+ 	return "", syscall.ENOMEM
+ }
+
+ // Flush implements p9.File. It does nothing and always succeeds.
+ func (*localFile) Flush() error {
+ 	return nil
+ }
+
+// Connect implements p9.File.
+func (l *localFile) Connect(p9.ConnectFlags) (*fd.FD, error) {
+ return nil, syscall.ECONNREFUSED
+}
+
+ // Close implements p9.File.
+ //
+ // It marks the file as not open, drops the reference to the host file and
+ // returns the result of closing it.
+ func (l *localFile) Close() error {
+ 	hostFile := l.file
+ 	l.mode = invalidMode
+ 	closeErr := hostFile.Close()
+ 	l.file = nil
+ 	return closeErr
+ }
+
+ // isOpen reports whether the file currently has a valid open mode, i.e. its
+ // mode is anything other than invalidMode.
+ func (l *localFile) isOpen() bool {
+ 	notOpen := l.mode == invalidMode
+ 	return !notOpen
+ }
+
+ // Renamed implements p9.Renamed.
+ //
+ // It updates the recorded host path to 'newName' inside 'newDir', which must
+ // be a *localFile.
+ func (l *localFile) Renamed(newDir p9.File, newName string) {
+ 	parent := newDir.(*localFile)
+ 	l.hostPath = path.Join(parent.hostPath, newName)
+ }
+
+// extractErrno tries to determine the errno.
+func extractErrno(err error) syscall.Errno {
+ if err == nil {
+ // This should never happen. The likely result will be that
+ // some user gets the frustrating "error: SUCCESS" message.
+ log.Warningf("extractErrno called with nil error!")
+ return 0
+ }
+
+ switch err {
+ case os.ErrNotExist:
+ return syscall.ENOENT
+ case os.ErrExist:
+ return syscall.EEXIST
+ case os.ErrPermission:
+ return syscall.EACCES
+ case os.ErrInvalid:
+ return syscall.EINVAL
+ }
+
+ // See if it's an errno or a common wrapped error.
+ switch e := err.(type) {
+ case syscall.Errno:
+ return e
+ case *os.PathError:
+ return extractErrno(e.Err)
+ case *os.LinkError:
+ return extractErrno(e.Err)
+ case *os.SyscallError:
+ return extractErrno(e.Err)
+ }
+
+ // Fall back to EIO.
+ log.Debugf("Unknown error: %v, defaulting to EIO", err)
+ return syscall.EIO
+}
diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go
new file mode 100644
index 000000000..58af5e44d
--- /dev/null
+++ b/runsc/fsgofer/fsgofer_unsafe.go
@@ -0,0 +1,107 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsgofer
+
+import (
+ "syscall"
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/syserr"
+)
+
+ // statAt stats 'name' relative to the directory 'dirFd' using newfstatat(2)
+ // with AT_SYMLINK_NOFOLLOW, so a symlink is described itself rather than
+ // followed. It fails if 'name' cannot be converted to a C string.
+ func statAt(dirFd int, name string) (syscall.Stat_t, error) {
+ nameBytes, err := syscall.BytePtrFromString(name)
+ if err != nil {
+ return syscall.Stat_t{}, err
+ }
+ namePtr := unsafe.Pointer(nameBytes)
+
+ var stat syscall.Stat_t
+ statPtr := unsafe.Pointer(&stat)
+
+ // The pointers are converted to uintptr directly in the argument list of the
+ // syscall, as required by the rules of package unsafe.
+ if _, _, errno := syscall.Syscall6(
+ syscall.SYS_NEWFSTATAT,
+ uintptr(dirFd),
+ uintptr(namePtr),
+ uintptr(statPtr),
+ linux.AT_SYMLINK_NOFOLLOW,
+ 0,
+ 0); errno != 0 {
+
+ return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
+ }
+ return stat, nil
+ }
+
+ // utimensat invokes utimensat(2) with the given timestamps and flags. A
+ // non-empty 'name' is resolved relative to 'dirFd'; an empty 'name' passes a
+ // nil path so that the call applies to 'dirFd' itself. times[0] and times[1]
+ // are handed to the kernel as one contiguous array.
+ func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) error {
+ // utimensat(2) doesn't accept empty name, instead name must be nil to make it
+ // operate directly on 'dirFd' unlike other *at syscalls.
+ var namePtr unsafe.Pointer
+ if name != "" {
+ nameBytes, err := syscall.BytePtrFromString(name)
+ if err != nil {
+ return err
+ }
+ namePtr = unsafe.Pointer(nameBytes)
+ }
+
+ timesPtr := unsafe.Pointer(&times[0])
+
+ // The pointers are converted to uintptr directly in the argument list of the
+ // syscall, as required by the rules of package unsafe.
+ if _, _, errno := syscall.Syscall6(
+ syscall.SYS_UTIMENSAT,
+ uintptr(dirFd),
+ uintptr(namePtr),
+ uintptr(timesPtr),
+ uintptr(flags),
+ 0,
+ 0); errno != 0 {
+
+ return syserr.FromHost(errno).ToError()
+ }
+ return nil
+ }
+
+ // renameat invokes renameat(2), resolving 'oldName' relative to 'oldDirFD' and
+ // 'newName' relative to 'newDirFD'. An empty name is passed to the kernel as a
+ // nil pointer rather than as an empty string.
+ func renameat(oldDirFD int, oldName string, newDirFD int, newName string) error {
+ var oldNamePtr unsafe.Pointer
+ if oldName != "" {
+ nameBytes, err := syscall.BytePtrFromString(oldName)
+ if err != nil {
+ return err
+ }
+ oldNamePtr = unsafe.Pointer(nameBytes)
+ }
+ var newNamePtr unsafe.Pointer
+ if newName != "" {
+ nameBytes, err := syscall.BytePtrFromString(newName)
+ if err != nil {
+ return err
+ }
+ newNamePtr = unsafe.Pointer(nameBytes)
+ }
+
+ // The pointers are converted to uintptr directly in the argument list of the
+ // syscall, as required by the rules of package unsafe.
+ if _, _, errno := syscall.Syscall6(
+ syscall.SYS_RENAMEAT,
+ uintptr(oldDirFD),
+ uintptr(oldNamePtr),
+ uintptr(newDirFD),
+ uintptr(newNamePtr),
+ 0,
+ 0); errno != 0 {
+
+ return syserr.FromHost(errno).ToError()
+ }
+ return nil
+ }
diff --git a/runsc/main.go b/runsc/main.go
new file mode 100644
index 000000000..11bc73f75
--- /dev/null
+++ b/runsc/main.go
@@ -0,0 +1,279 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Binary runsc is an implementation of the Open Container Initiative Runtime
+// that runs applications inside a sandbox.
+package main
+
+import (
+ "context"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "strings"
+ "syscall"
+
+ "flag"
+
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/cmd"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+ // Command-line flags. Each flag is read in main() and copied into the
+ // boot.Config handed to the subcommand, or used to set up logging.
+ var (
+ 	// Although these flags are not part of the OCI spec, they are used by
+ 	// Docker, and thus should not be changed.
+ 	rootDir = flag.String("root", "", "root directory for storage of container state")
+ 	logFilename = flag.String("log", "", "file path where internal debug information is written, default is stderr")
+ 	logFormat = flag.String("log-format", "text", "log format: text (default), json, or json-k8s")
+ 	debug = flag.Bool("debug", false, "enable debug logging")
+ 	showVersion = flag.Bool("version", false, "show version and exit")
+
+ 	// These flags are unique to runsc, and are used to configure parts of the
+ 	// system that are not covered by the runtime spec.
+
+ 	// Debugging flags.
+ 	debugLog = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
+ 	logPackets = flag.Bool("log-packets", false, "enable network packet logging")
+ 	logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.")
+ 	debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log' flag is ignored.")
+ 	debugLogFormat = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s")
+
+ 	// Debugging flags: strace related
+ 	strace = flag.Bool("strace", false, "enable strace")
+ 	straceSyscalls = flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.")
+ 	straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs")
+
+ 	// Flags that control sandbox runtime behavior.
+ 	platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm")
+ 	network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
+ 	gso = flag.Bool("gso", true, "enable generic segmentation offload")
+ 	fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
+ 	overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
+ 	watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
+ 	panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
+ 	profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
+ 	netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
+
+ 	testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
+ )
+
+ // main registers the runsc subcommands, parses the global flags into a
+ // boot.Config, sets up logging, and runs the requested subcommand. The
+ // process exit code is derived from the wait status that the subcommand
+ // fills in, or is 128 if the subcommand itself fails.
+ func main() {
+ 	// Help and flags commands are generated automatically.
+ 	subcommands.Register(subcommands.HelpCommand(), "")
+ 	subcommands.Register(subcommands.FlagsCommand(), "")
+
+ 	// Register user-facing runsc commands.
+ 	subcommands.Register(new(cmd.Checkpoint), "")
+ 	subcommands.Register(new(cmd.Create), "")
+ 	subcommands.Register(new(cmd.Delete), "")
+ 	subcommands.Register(new(cmd.Do), "")
+ 	subcommands.Register(new(cmd.Events), "")
+ 	subcommands.Register(new(cmd.Exec), "")
+ 	subcommands.Register(new(cmd.Kill), "")
+ 	subcommands.Register(new(cmd.List), "")
+ 	subcommands.Register(new(cmd.Pause), "")
+ 	subcommands.Register(new(cmd.PS), "")
+ 	subcommands.Register(new(cmd.Restore), "")
+ 	subcommands.Register(new(cmd.Resume), "")
+ 	subcommands.Register(new(cmd.Run), "")
+ 	subcommands.Register(new(cmd.Spec), "")
+ 	subcommands.Register(new(cmd.Start), "")
+ 	subcommands.Register(new(cmd.State), "")
+ 	subcommands.Register(new(cmd.Wait), "")
+
+ 	// Register internal commands with the internal group name. This causes
+ 	// them to be sorted below the user-facing commands with empty group.
+ 	// The string below will be printed above the commands.
+ 	//
+ 	// Gofer is registered here only; it is an internal command and must not
+ 	// be registered a second time in the user-facing group above.
+ 	const internalGroup = "internal use only"
+ 	subcommands.Register(new(cmd.Boot), internalGroup)
+ 	subcommands.Register(new(cmd.Debug), internalGroup)
+ 	subcommands.Register(new(cmd.Gofer), internalGroup)
+
+ 	// All subcommands must be registered before flag parsing.
+ 	flag.Parse()
+
+ 	// Are we showing the version?
+ 	if *showVersion {
+ 		// The format here is the same as runc.
+ 		fmt.Fprintf(os.Stdout, "runsc version %s\n", version)
+ 		fmt.Fprintf(os.Stdout, "spec: %s\n", specutils.Version)
+ 		os.Exit(0)
+ 	}
+
+ 	platformType, err := boot.MakePlatformType(*platform)
+ 	if err != nil {
+ 		cmd.Fatalf("%v", err)
+ 	}
+
+ 	fsAccess, err := boot.MakeFileAccessType(*fileAccess)
+ 	if err != nil {
+ 		cmd.Fatalf("%v", err)
+ 	}
+
+ 	if fsAccess == boot.FileAccessShared && *overlay {
+ 		cmd.Fatalf("overlay flag is incompatible with shared file access")
+ 	}
+
+ 	netType, err := boot.MakeNetworkType(*network)
+ 	if err != nil {
+ 		cmd.Fatalf("%v", err)
+ 	}
+
+ 	wa, err := boot.MakeWatchdogAction(*watchdogAction)
+ 	if err != nil {
+ 		cmd.Fatalf("%v", err)
+ 	}
+
+ 	// Create a new Config from the flags.
+ 	conf := &boot.Config{
+ 		RootDir: *rootDir,
+ 		Debug: *debug,
+ 		LogFilename: *logFilename,
+ 		LogFormat: *logFormat,
+ 		DebugLog: *debugLog,
+ 		DebugLogFormat: *debugLogFormat,
+ 		FileAccess: fsAccess,
+ 		Overlay: *overlay,
+ 		Network: netType,
+ 		GSO: *gso,
+ 		LogPackets: *logPackets,
+ 		Platform: platformType,
+ 		Strace: *strace,
+ 		StraceLogSize: *straceLogSize,
+ 		WatchdogAction: wa,
+ 		PanicSignal: *panicSignal,
+ 		ProfileEnable: *profile,
+ 		EnableRaw: *netRaw,
+ 		TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
+ 	}
+ 	if len(*straceSyscalls) != 0 {
+ 		conf.StraceSyscalls = strings.Split(*straceSyscalls, ",")
+ 	}
+
+ 	// Set up logging.
+ 	if *debug {
+ 		log.SetLevel(log.Debug)
+ 	}
+
+ 	subcommand := flag.CommandLine.Arg(0)
+
+ 	// Pick the main log destination: --log-fd wins over --log; with neither,
+ 	// logs go to stderr, except for the 'do' command whose logs are discarded.
+ 	var logFile io.Writer = os.Stderr
+ 	if *logFD > -1 {
+ 		logFile = os.NewFile(uintptr(*logFD), "log file")
+ 	} else if *logFilename != "" {
+ 		// We must set O_APPEND and not O_TRUNC because Docker passes
+ 		// the same log file for all commands (and also parses these
+ 		// log files), so we can't destroy them on each command.
+ 		f, err := os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
+ 		if err != nil {
+ 			cmd.Fatalf("error opening log file %q: %v", *logFilename, err)
+ 		}
+ 		logFile = f
+ 	} else if subcommand == "do" {
+ 		logFile = ioutil.Discard
+ 	}
+
+ 	e := newEmitter(*logFormat, logFile)
+
+ 	if *debugLogFD > -1 {
+ 		f := os.NewFile(uintptr(*debugLogFD), "debug log file")
+
+ 		// Quick sanity check to make sure no other commands get passed
+ 		// a log fd (they should use log dir instead).
+ 		if subcommand != "boot" && subcommand != "gofer" {
+ 			cmd.Fatalf("flag --debug-log-fd should only be passed to 'boot' and 'gofer' command, but was passed to %q", subcommand)
+ 		}
+
+ 		// If we are the boot process, then we own our stdio FDs and
+ 		// can do what we want with them. Since Docker and Containerd
+ 		// both eat boot's stderr, we dup our stderr to the provided
+ 		// log FD so that panics will appear in the logs, rather than
+ 		// just disappear.
+ 		if err := syscall.Dup2(int(f.Fd()), int(os.Stderr.Fd())); err != nil {
+ 			cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err)
+ 		}
+
+ 		if logFile == os.Stderr {
+ 			// Suppress logging to stderr when debug log is enabled. Otherwise all
+ 			// messages will be duplicated in the debug log (see Dup2() call above).
+ 			e = newEmitter(*debugLogFormat, f)
+ 		} else {
+ 			e = log.MultiEmitter{e, newEmitter(*debugLogFormat, f)}
+ 		}
+ 	} else if *debugLog != "" {
+ 		f, err := specutils.DebugLogFile(*debugLog, subcommand)
+ 		if err != nil {
+ 			cmd.Fatalf("error opening debug log file in %q: %v", *debugLog, err)
+ 		}
+ 		e = log.MultiEmitter{e, newEmitter(*debugLogFormat, f)}
+ 	}
+
+ 	log.SetTarget(e)
+
+ 	log.Infof("***************************")
+ 	log.Infof("Args: %s", os.Args)
+ 	log.Infof("Version %s", version)
+ 	log.Infof("PID: %d", os.Getpid())
+ 	log.Infof("UID: %d, GID: %d", os.Getuid(), os.Getgid())
+ 	log.Infof("Configuration:")
+ 	log.Infof("\t\tRootDir: %s", conf.RootDir)
+ 	log.Infof("\t\tPlatform: %v", conf.Platform)
+ 	log.Infof("\t\tFileAccess: %v, overlay: %t", conf.FileAccess, conf.Overlay)
+ 	log.Infof("\t\tNetwork: %v, logging: %t", conf.Network, conf.LogPackets)
+ 	log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls)
+ 	log.Infof("***************************")
+
+ 	// Call the subcommand and pass in the configuration.
+ 	var ws syscall.WaitStatus
+ 	subcmdCode := subcommands.Execute(context.Background(), conf, &ws)
+ 	if subcmdCode == subcommands.ExitSuccess {
+ 		log.Infof("Exiting with status: %v", ws)
+ 		if ws.Signaled() {
+ 			// No good way to return it, emulate what the shell does. Maybe raise
+ 			// signal to self?
+ 			os.Exit(128 + int(ws.Signal()))
+ 		}
+ 		os.Exit(ws.ExitStatus())
+ 	}
+ 	// Return an error that is unlikely to be used by the application.
+ 	log.Warningf("Failure to execute command, err: %v", subcmdCode)
+ 	os.Exit(128)
+ }
+
+ // newEmitter returns a log emitter that writes to logFile in the given
+ // format, which must be "text", "json" or "json-k8s". Any other format is
+ // reported through cmd.Fatalf.
+ func newEmitter(format string, logFile io.Writer) log.Emitter {
+ 	if format == "text" {
+ 		return &log.GoogleEmitter{&log.Writer{Next: logFile}}
+ 	}
+ 	if format == "json" {
+ 		return &log.JSONEmitter{log.Writer{Next: logFile}}
+ 	}
+ 	if format == "json-k8s" {
+ 		return &log.K8sJSONEmitter{log.Writer{Next: logFile}}
+ 	}
+ 	cmd.Fatalf("invalid log format %q, must be 'text', 'json', or 'json-k8s'", format)
+ 	panic("unreachable")
+ }
+
+ // init picks the initial value of the 'root' flag: a "runsc" directory under
+ // $XDG_RUNTIME_DIR when that variable is set, "/var/run/runsc" otherwise.
+ func init() {
+ 	// Set default root dir to something (hopefully) user-writeable.
+ 	dir := "/var/run/runsc"
+ 	if xdg := os.Getenv("XDG_RUNTIME_DIR"); xdg != "" {
+ 		dir = filepath.Join(xdg, "runsc")
+ 	}
+ 	*rootDir = dir
+ }
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
new file mode 100644
index 000000000..0460d5f1a
--- /dev/null
+++ b/runsc/sandbox/network.go
@@ -0,0 +1,375 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sandbox
+
+import (
+ "fmt"
+ "net"
+ "os"
+ "path/filepath"
+ "runtime"
+ "strconv"
+ "strings"
+ "syscall"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "github.com/vishvananda/netlink"
+ "golang.org/x/sys/unix"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/urpc"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+const (
+ // Annotations used to indicate whether the container corresponds to a
+ // pod or a container within a pod.
+ //
+ // crioContainerTypeAnnotation is the annotation key in the
+ // "io.kubernetes.cri-o" namespace (presumably the one written by CRI-O;
+ // confirm against the code that reads it).
+ crioContainerTypeAnnotation = "io.kubernetes.cri-o.ContainerType"
+ // containerdContainerTypeAnnotation is the annotation key in the
+ // "io.kubernetes.cri" namespace (presumably the one written by
+ // containerd's CRI plugin; confirm against the code that reads it).
+ containerdContainerTypeAnnotation = "io.kubernetes.cri.container-type"
+)
+
+// setupNetwork configures the sandbox network stack so that it mimics the
+// local network configuration. Docker uses network namespaces with vnets to
+// configure the container network, and the untrusted application expects to
+// see that same network from inside the sandbox. Routing and port mapping are
+// handled directly by docker; most of that information is not even available
+// to the runtime.
+//
+// Netstack inside the sandbox speaks directly to the device using a raw
+// socket. All IP addresses assigned to the NIC are removed from it and passed
+// on to netstack's device.
+//
+// The behavior depends on conf.Network:
+//   - boot.NetworkNone: only a default loopback interface is created.
+//   - boot.NetworkSandbox: interfaces and routes are copied from the network
+//     namespace of the process with the given pid (/proc/<pid>/ns/net).
+//   - boot.NetworkHost: nothing is configured.
+//
+// Any other value yields an error. The spec argument is currently unused.
+//
+// Run the following container to test it:
+// docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4
+func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Config) error {
+	log.Infof("Setting up network")
+
+	switch mode := conf.Network; mode {
+	case boot.NetworkHost:
+		// Nothing to do here.
+		return nil
+
+	case boot.NetworkNone:
+		log.Infof("Network is disabled, create loopback interface only")
+		if err := createDefaultLoopbackInterface(conn); err != nil {
+			return fmt.Errorf("creating default loopback interface: %v", err)
+		}
+		return nil
+
+	case boot.NetworkSandbox:
+		// The net namespace of the sandbox process is the one that gets
+		// copied into the sandbox.
+		nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
+		if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO); err != nil {
+			return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err)
+		}
+		return nil
+	}
+
+	return fmt.Errorf("invalid network type: %d", conf.Network)
+}
+
+// createDefaultLoopbackInterface asks the sandbox, over conn, to create a
+// single loopback link named "lo". The link carries 127.0.0.1 and ::1 and has
+// two routes: 127.0.0.0 with mask ff.00.00.00, and ::1 with an all-ones
+// 16-byte mask.
+func createDefaultLoopbackInterface(conn *urpc.Client) error {
+	lo := boot.LoopbackLink{Name: "lo"}
+	lo.Addresses = []net.IP{
+		net.IP("\x7f\x00\x00\x01"),
+		net.IPv6loopback,
+	}
+	lo.Routes = []boot.Route{
+		{
+			Destination: net.IP("\x7f\x00\x00\x00"),
+			Mask:        net.IPMask("\xff\x00\x00\x00"),
+		},
+		{
+			Destination: net.IPv6loopback,
+			Mask:        net.IPMask(strings.Repeat("\xff", 16)),
+		},
+	}
+
+	req := &boot.CreateLinksAndRoutesArgs{
+		LoopbackLinks: []boot.LoopbackLink{lo},
+	}
+	if err := conn.Call(boot.NetworkCreateLinksAndRoutes, req, nil); err != nil {
+		return fmt.Errorf("creating loopback link and routes: %v", err)
+	}
+	return nil
+}
+
+// joinNetNS locks the calling goroutine to its OS thread and applies the
+// network namespace found at nsPath via specutils.ApplyNS. On success it
+// returns a function that undoes both steps: it first calls the restore
+// function returned by ApplyNS and then unlocks the OS thread. On failure the
+// thread is unlocked again before the error is returned.
+func joinNetNS(nsPath string) (func(), error) {
+	target := specs.LinuxNamespace{
+		Type: specs.NetworkNamespace,
+		Path: nsPath,
+	}
+
+	runtime.LockOSThread()
+	leaveNS, err := specutils.ApplyNS(target)
+	if err != nil {
+		runtime.UnlockOSThread()
+		return nil, fmt.Errorf("joining net namespace %q: %v", nsPath, err)
+	}
+
+	undo := func() {
+		leaveNS()
+		runtime.UnlockOSThread()
+	}
+	return undo, nil
+}
+
+// isRootNS determines whether we are running in the root net namespace.
+// /proc/sys/net/core/rmem_default only exists in root network namespace.
+func isRootNS() (bool, error) {
+ err := syscall.Access("/proc/sys/net/core/rmem_default", syscall.F_OK)
+ switch err {
+ case nil:
+ return true, nil
+ case syscall.ENOENT:
+ return false, nil
+ default:
+ return false, fmt.Errorf("failed to access /proc/sys/net/core/rmem_default: %v", err)
+ }
+}
+
+// createInterfacesAndRoutesFromNS scrapes the interface and routes from the
+// net namespace with the given path, creates them in the sandbox, and removes
+// them from the host.
+//
+// The function joins the namespace at nsPath for its whole duration and
+// refuses to run if that namespace turns out to be the root network namespace.
+// Interfaces that are down are skipped. Loopback interfaces are converted to
+// boot.LoopbackLink entries. Every other interface that has at least one IPv4
+// address gets a raw AF_PACKET socket bound to it; the socket is sent to the
+// sandbox together with the interface's addresses and routes, and the
+// addresses are then deleted from the host interface. When enableGSO is true
+// and the device reports GSO as enabled, PACKET_VNET_HDR is turned on for the
+// socket.
+//
+// This process' copies of the raw sockets are closed before returning, both on
+// success (after the RPC has completed) and on every error path.
+func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool) error {
+	// Join the network namespace that we will be copying.
+	restore, err := joinNetNS(nsPath)
+	if err != nil {
+		return err
+	}
+	defer restore()
+
+	// Get all interfaces in the namespace.
+	ifaces, err := net.Interfaces()
+	if err != nil {
+		return fmt.Errorf("querying interfaces: %v", err)
+	}
+
+	isRoot, err := isRootNS()
+	if err != nil {
+		return err
+	}
+	if isRoot {
+		return fmt.Errorf("cannot run with network enabled in root network namespace")
+	}
+
+	// Collect addresses and routes from the interfaces.
+	var args boot.CreateLinksAndRoutesArgs
+
+	// Each raw socket created below is recorded in args.FilePayload.Files as
+	// soon as it exists, so this single deferred loop releases all of them no
+	// matter how the function exits. By the time it runs, the RPC (if it was
+	// reached) has already returned.
+	defer func() {
+		for _, f := range args.FilePayload.Files {
+			f.Close()
+		}
+	}()
+
+	for _, iface := range ifaces {
+		if iface.Flags&net.FlagUp == 0 {
+			log.Infof("Skipping down interface: %+v", iface)
+			continue
+		}
+
+		allAddrs, err := iface.Addrs()
+		if err != nil {
+			return fmt.Errorf("fetching interface addresses for %q: %v", iface.Name, err)
+		}
+
+		// We build our own loopback devices.
+		if iface.Flags&net.FlagLoopback != 0 {
+			links, err := loopbackLinks(iface, allAddrs)
+			if err != nil {
+				return fmt.Errorf("getting loopback routes and links for iface %q: %v", iface.Name, err)
+			}
+			args.LoopbackLinks = append(args.LoopbackLinks, links...)
+			continue
+		}
+
+		// Keep only IPv4 addresses.
+		var ip4addrs []*net.IPNet
+		for _, ifaddr := range allAddrs {
+			ipNet, ok := ifaddr.(*net.IPNet)
+			if !ok {
+				return fmt.Errorf("address is not IPNet: %+v", ifaddr)
+			}
+			if ipNet.IP.To4() == nil {
+				log.Warningf("IPv6 is not supported, skipping: %v", ipNet)
+				continue
+			}
+			ip4addrs = append(ip4addrs, ipNet)
+		}
+		if len(ip4addrs) == 0 {
+			log.Warningf("No IPv4 address found for interface %q, skipping", iface.Name)
+			continue
+		}
+
+		// Create the socket.
+		const protocol = 0x0300 // htons(ETH_P_ALL)
+		fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
+		if err != nil {
+			return fmt.Errorf("unable to create raw socket: %v", err)
+		}
+		deviceFile := os.NewFile(uintptr(fd), "raw-device-fd")
+		// Register the file immediately so that the deferred cleanup covers
+		// it. There is no 'continue' between here and the matching
+		// FDBasedLinks append at the end of the iteration, so the two slices
+		// stay index-aligned whenever the RPC is reached.
+		args.FilePayload.Files = append(args.FilePayload.Files, deviceFile)
+
+		// Bind to the appropriate device.
+		ll := syscall.SockaddrLinklayer{
+			Protocol: protocol,
+			Ifindex:  iface.Index,
+			Hatype:   0, // No ARP type.
+			Pkttype:  syscall.PACKET_OTHERHOST,
+		}
+		if err := syscall.Bind(fd, &ll); err != nil {
+			return fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
+		}
+
+		// Scrape the routes before removing the address, since that
+		// will remove the routes as well.
+		routes, def, err := routesForIface(iface)
+		if err != nil {
+			return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err)
+		}
+		if def != nil {
+			if !args.DefaultGateway.Route.Empty() {
+				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, def, args.DefaultGateway)
+			}
+			args.DefaultGateway.Route = *def
+			args.DefaultGateway.Name = iface.Name
+		}
+
+		link := boot.FDBasedLink{
+			Name:   iface.Name,
+			MTU:    iface.MTU,
+			Routes: routes,
+		}
+
+		// Get the link for the interface.
+		ifaceLink, err := netlink.LinkByName(iface.Name)
+		if err != nil {
+			return fmt.Errorf("getting link for interface %q: %v", iface.Name, err)
+		}
+		link.LinkAddress = []byte(ifaceLink.Attrs().HardwareAddr)
+
+		if enableGSO {
+			gso, err := isGSOEnabled(fd, iface.Name)
+			if err != nil {
+				return fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err)
+			}
+			if gso {
+				if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
+					return fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
+				}
+				link.GSOMaxSize = ifaceLink.Attrs().GSOMaxSize
+			} else {
+				log.Infof("GSO not available in host.")
+			}
+		}
+
+		// Use SO_RCVBUFFORCE because on linux the receive buffer for an
+		// AF_PACKET socket is capped by "net.core.rmem_max". rmem_max
+		// defaults to a unusually low value of 208KB. This is too low
+		// for gVisor to be able to receive packets at high throughputs
+		// without incurring packet drops.
+		const rcvBufSize = 4 << 20 // 4MB.
+
+		if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil {
+			return fmt.Errorf("failed to increase socket rcv buffer to %d: %v", rcvBufSize, err)
+		}
+
+		// Collect the addresses for the interface, enable forwarding,
+		// and remove them from the host.
+		for _, addr := range ip4addrs {
+			link.Addresses = append(link.Addresses, addr.IP)
+
+			// Steal IP address from NIC.
+			if err := removeAddress(ifaceLink, addr.String()); err != nil {
+				// Argument order must follow the format string: address
+				// first, then device name.
+				return fmt.Errorf("removing address %v from device %q: %v", addr, iface.Name, err)
+			}
+		}
+
+		args.FDBasedLinks = append(args.FDBasedLinks, link)
+	}
+
+	log.Debugf("Setting up network, config: %+v", args)
+	if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil {
+		return fmt.Errorf("creating links and routes: %v", err)
+	}
+	return nil
+}
+
+// loopbackLinks builds one boot.LoopbackLink per address of the given
+// loopback interface. Each link is named after the interface, carries that
+// single address, and has a single route whose destination is the address
+// masked with its own network mask. An error is returned if any address is
+// not a *net.IPNet.
+func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink, error) {
+	var result []boot.LoopbackLink
+	for _, a := range addrs {
+		network, isIPNet := a.(*net.IPNet)
+		if !isIPNet {
+			return nil, fmt.Errorf("address is not IPNet: %+v", a)
+		}
+
+		route := boot.Route{
+			Destination: network.IP.Mask(network.Mask),
+			Mask:        network.Mask,
+		}
+		lo := boot.LoopbackLink{
+			Name:      iface.Name,
+			Addresses: []net.IP{network.IP},
+			Routes:    []boot.Route{route},
+		}
+		result = append(result, lo)
+	}
+	return result, nil
+}
+
+// routesForIface lists every route attached to the given interface and
+// converts them to boot.Route values.
+//
+// It returns the non-default IPv4 routes, plus a pointer to the default route
+// (nil when the interface has none). A route without a destination is treated
+// as a default route and is turned into a catch-all 0.0.0.0/0 route through
+// its gateway. IPv6 routes are skipped with a warning. An error is returned
+// when a default route has no gateway or when more than one IPv4 default
+// route is found.
+func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) {
+	link, err := netlink.LinkByIndex(iface.Index)
+	if err != nil {
+		return nil, nil, err
+	}
+	kernelRoutes, err := netlink.RouteList(link, netlink.FAMILY_ALL)
+	if err != nil {
+		return nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err)
+	}
+
+	var (
+		defaultRoute *boot.Route
+		converted    []boot.Route
+	)
+	for _, kr := range kernelRoutes {
+		// Regular (non-default) route.
+		if kr.Dst != nil {
+			if kr.Dst.IP.To4() == nil {
+				log.Warningf("IPv6 is not supported, skipping route: %v", kr)
+				continue
+			}
+			converted = append(converted, boot.Route{
+				Destination: kr.Dst.IP.Mask(kr.Dst.Mask),
+				Mask:        kr.Dst.Mask,
+				Gateway:     kr.Gw,
+			})
+			continue
+		}
+
+		// No destination: this is a default route.
+		switch {
+		case kr.Gw == nil:
+			return nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, kr)
+		case kr.Gw.To4() == nil:
+			log.Warningf("IPv6 is not supported, skipping default route: %v", kr)
+			continue
+		case defaultRoute != nil:
+			return nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defaultRoute, kr)
+		}
+
+		// Create a catch all route to the gateway.
+		defaultRoute = &boot.Route{
+			Destination: net.IPv4zero,
+			Mask:        net.IPMask(net.IPv4zero),
+			Gateway:     kr.Gw,
+		}
+	}
+	return converted, defaultRoute, nil
+}
+
+// removeAddress deletes the address ipAndMask (in the textual form accepted
+// by netlink.ParseAddr) from the given network device. It's equivalent to:
+// ip addr del <ipAndMask> dev <name>
+func removeAddress(source netlink.Link, ipAndMask string) error {
+	parsed, err := netlink.ParseAddr(ipAndMask)
+	if err == nil {
+		err = netlink.AddrDel(source, parsed)
+	}
+	return err
+}
diff --git a/runsc/sandbox/network_unsafe.go b/runsc/sandbox/network_unsafe.go
new file mode 100644
index 000000000..2a2a0fb7e
--- /dev/null
+++ b/runsc/sandbox/network_unsafe.go
@@ -0,0 +1,56 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sandbox
+
+import (
+ "syscall"
+ "unsafe"
+
+ "golang.org/x/sys/unix"
+)
+
+// ethtoolValue is the payload pointed to by ifreq.ifrData for the
+// SIOCETHTOOL ioctl issued in isGSOEnabled: a command code plus a single
+// 32-bit value. NOTE(review): the layout presumably mirrors the kernel's
+// struct ethtool_value -- confirm against linux/ethtool.h.
+type ethtoolValue struct {
+ // cmd is the ethtool command to execute (e.g. _ETHTOOL_GGSO).
+ cmd uint32
+ // val is read back after the ioctl as the command's result.
+ val uint32
+}
+
+// ifreq is the request structure whose address is passed to the SIOCETHTOOL
+// ioctl in isGSOEnabled. NOTE(review): presumably laid out to match the
+// prefix of the kernel's struct ifreq -- confirm.
+type ifreq struct {
+ // ifrName holds the interface name, copied into a fixed IFNAMSIZ buffer.
+ ifrName [unix.IFNAMSIZ]byte
+ // ifrData points at the ethtool command/value pair for the request.
+ ifrData *ethtoolValue
+}
+
+const (
+ // _ETHTOOL_GGSO is the ethtool command code placed in ethtoolValue.cmd by
+ // isGSOEnabled to query whether GSO is enabled on a device.
+ _ETHTOOL_GGSO = 0x00000023
+)
+
+func isGSOEnabled(fd int, intf string) (bool, error) {
+ val := ethtoolValue{
+ cmd: _ETHTOOL_GGSO,
+ }
+
+ var name [unix.IFNAMSIZ]byte
+ copy(name[:], []byte(intf))
+
+ ifr := ifreq{
+ ifrName: name,
+ ifrData: &val,
+ }
+
+ if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), unix.SIOCETHTOOL, uintptr(unsafe.Pointer(&ifr))); err != 0 {
+ return false, err
+ }
+
+ return val.val != 0, nil
+}
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
new file mode 100644
index 000000000..47a66afb2
--- /dev/null
+++ b/runsc/sandbox/sandbox.go
@@ -0,0 +1,992 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package sandbox creates and manipulates sandboxes.
+package sandbox
+
+import (
+ "context"
+ "fmt"
+ "os"
+ "os/exec"
+ "strconv"
+ "sync"
+ "syscall"
+ "time"
+
+ "github.com/cenkalti/backoff"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "github.com/syndtr/gocapability/capability"
+ "gvisor.googlesource.com/gvisor/pkg/control/client"
+ "gvisor.googlesource.com/gvisor/pkg/control/server"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/control"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm"
+ "gvisor.googlesource.com/gvisor/pkg/urpc"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/cgroup"
+ "gvisor.googlesource.com/gvisor/runsc/console"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+// Sandbox wraps a sandbox process.
+//
+// It is used to start/stop sandbox process (and associated processes like
+// gofers), as well as for running and manipulating containers inside a running
+// sandbox.
+//
+// Note: Sandbox must be immutable because a copy of it is saved for each
+// container and changes would not be synchronized to all of them.
+type Sandbox struct {
+ // ID is the id of the sandbox (immutable). By convention, this is the same
+ // ID as the first container run in the sandbox.
+ ID string `json:"id"`
+
+ // Pid is the pid of the running sandbox (immutable). May be 0 if the sandbox
+ // is not running.
+ Pid int `json:"pid"`
+
+ // Cgroup has the cgroup configuration for the sandbox.
+ Cgroup *cgroup.Cgroup `json:"cgroup"`
+
+ // child is set if a sandbox process is a child of the current process.
+ //
+ // This field isn't saved to json, because only a creator of sandbox
+ // will have it as a child process.
+ child bool
+
+ // status is an exit status of a sandbox process. Like child, it is
+ // unexported and carries no json tag.
+ status syscall.WaitStatus
+
+ // statusMu protects status.
+ statusMu sync.Mutex
+}
+
+// New creates the sandbox process and waits for it to signal that it has
+// booted. The caller must call Destroy() on the returned sandbox.
+//
+// The boot handshake uses a pipe: the write end is handed to the sandbox
+// process and the read end is kept here. New blocks until one byte can be
+// read from the pipe; a read error or a short read is reported as a failure
+// to start.
+//
+// If any step fails, the partially created sandbox is destroyed before the
+// error is returned. A failure of that cleanup is only logged.
+func New(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, userLog string, ioFiles []*os.File, specFile *os.File, cg *cgroup.Cgroup) (*Sandbox, error) {
+	s := &Sandbox{ID: id, Cgroup: cg}
+	// The Cleanup object cleans up partially created sandboxes when an error
+	// occurs. Any errors occurring during cleanup itself are ignored, apart
+	// from being logged.
+	c := specutils.MakeCleanup(func() {
+		// Only warn when destroy actually failed; a successful cleanup must
+		// not produce an "error destroying sandbox: <nil>" log line.
+		if err := s.destroy(); err != nil {
+			log.Warningf("error destroying sandbox: %v", err)
+		}
+	})
+	defer c.Clean()
+
+	// Create pipe to synchronize when sandbox process has been booted.
+	clientSyncFile, sandboxSyncFile, err := os.Pipe()
+	if err != nil {
+		return nil, fmt.Errorf("creating pipe for sandbox %q: %v", s.ID, err)
+	}
+	defer clientSyncFile.Close()
+
+	// Create the sandbox process.
+	err = s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, userLog, ioFiles, specFile, sandboxSyncFile)
+	// sandboxSyncFile has to be closed to be able to detect when the sandbox
+	// process exits unexpectedly.
+	sandboxSyncFile.Close()
+	if err != nil {
+		return nil, err
+	}
+
+	// Wait until the sandbox has booted.
+	b := make([]byte, 1)
+	if l, err := clientSyncFile.Read(b); err != nil || l != 1 {
+		return nil, fmt.Errorf("waiting for sandbox to start: %v", err)
+	}
+
+	// Everything succeeded: disarm the cleanup so the sandbox stays alive.
+	c.Release()
+	return s, nil
+}
+
+// CreateContainer asks the sandbox control server to create a non-root
+// container with the given container ID inside this sandbox.
+func (s *Sandbox) CreateContainer(cid string) error {
+	log.Debugf("Create non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid)
+
+	conn, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return fmt.Errorf("couldn't connect to sandbox: %v", connErr)
+	}
+	defer conn.Close()
+
+	callErr := conn.Call(boot.ContainerCreate, &cid, nil)
+	if callErr != nil {
+		return fmt.Errorf("creating non-root container %q: %v", cid, callErr)
+	}
+	return nil
+}
+
+// StartRoot configures the sandbox network and then tells the sandbox control
+// server to start running the root container process.
+func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error {
+	log.Debugf("Start root sandbox %q, PID: %d", s.ID, s.Pid)
+
+	ctrl, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer ctrl.Close()
+
+	// The network has to be in place before the root container starts.
+	err = setupNetwork(ctrl, s.Pid, spec, conf)
+	if err != nil {
+		return fmt.Errorf("setting up network: %v", err)
+	}
+
+	// Ask the control server to start the root container, identified by the
+	// sandbox ID.
+	err = ctrl.Call(boot.RootContainerStart, &s.ID, nil)
+	if err != nil {
+		return fmt.Errorf("starting root container: %v", err)
+	}
+	return nil
+}
+
+// StartContainer tells the sandbox control server to start running the
+// non-root container cid. The current process' stdin/stdout/stderr and the
+// given gofer files are sent along with the request. All goferFiles are
+// closed when the function returns, whether or not it succeeds.
+func (s *Sandbox) StartContainer(spec *specs.Spec, conf *boot.Config, cid string, goferFiles []*os.File) error {
+	// Deferred on purpose: the files must stay open until the call below has
+	// completed.
+	for _, gf := range goferFiles {
+		defer gf.Close()
+	}
+
+	log.Debugf("Start non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return fmt.Errorf("couldn't connect to sandbox: %v", err)
+	}
+	defer conn.Close()
+
+	// The payload must contain stdin/stdout/stderr followed by the gofer
+	// files, in that order.
+	payload := make([]*os.File, 0, 3+len(goferFiles))
+	payload = append(payload, os.Stdin, os.Stdout, os.Stderr)
+	payload = append(payload, goferFiles...)
+
+	// Start running the container.
+	req := boot.StartArgs{
+		Spec:        spec,
+		Conf:        conf,
+		CID:         cid,
+		FilePayload: urpc.FilePayload{Files: payload},
+	}
+	err = conn.Call(boot.ContainerStart, &req, nil)
+	if err != nil {
+		return fmt.Errorf("starting non-root container %v: %v", spec.Process.Args, err)
+	}
+	return nil
+}
+
+// Restore opens the state file at filename, configures the sandbox network,
+// and sends the restore call for container cid to the sandbox. If the
+// configured platform needs a device file, it is passed along with the state
+// file. Both files are closed when the function returns.
+func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, filename string) error {
+	log.Debugf("Restore sandbox %q", s.ID)
+
+	stateFile, err := os.Open(filename)
+	if err != nil {
+		return fmt.Errorf("opening restore file %q failed: %v", filename, err)
+	}
+	defer stateFile.Close()
+
+	payload := []*os.File{stateFile}
+
+	// If the platform needs a device FD we must pass it in.
+	deviceFile, err := deviceFileForPlatform(conf.Platform)
+	if err != nil {
+		return err
+	}
+	if deviceFile != nil {
+		defer deviceFile.Close()
+		payload = append(payload, deviceFile)
+	}
+
+	opt := boot.RestoreOpts{
+		FilePayload: urpc.FilePayload{Files: payload},
+		SandboxID:   s.ID,
+	}
+
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	// Configure the network.
+	if err = setupNetwork(conn, s.Pid, spec, conf); err != nil {
+		return fmt.Errorf("setting up network: %v", err)
+	}
+
+	// Restore the container and start the root container.
+	if err = conn.Call(boot.ContainerRestore, &opt, nil); err != nil {
+		return fmt.Errorf("restoring container %q: %v", cid, err)
+	}
+	return nil
+}
+
+// Processes asks the sandbox for the list of processes, with their associated
+// metadata, that belong to container cid.
+func (s *Sandbox) Processes(cid string) ([]*control.Process, error) {
+	log.Debugf("Getting processes for container %q in sandbox %q", cid, s.ID)
+
+	ctrl, err := s.sandboxConnect()
+	if err != nil {
+		return nil, err
+	}
+	defer ctrl.Close()
+
+	var procs []*control.Process
+	err = ctrl.Call(boot.ContainerProcesses, &cid, &procs)
+	if err != nil {
+		return nil, fmt.Errorf("retrieving process data from sandbox: %v", err)
+	}
+	return procs, nil
+}
+
+// Execute runs the specified command in the container named by
+// args.ContainerID. It returns the PID of the newly created process, or 0 and
+// an error if the sandbox could not be reached or the command could not be
+// started.
+func (s *Sandbox) Execute(args *control.ExecArgs) (int32, error) {
+	log.Debugf("Executing new process in container %q in sandbox %q", args.ContainerID, s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		// sandboxConnect has already wrapped the error with connError;
+		// wrapping it a second time would duplicate the message prefix.
+		return 0, err
+	}
+	defer conn.Close()
+
+	// Send a message to the sandbox control server to start the container.
+	var pid int32
+	if err := conn.Call(boot.ContainerExecuteAsync, args, &pid); err != nil {
+		return 0, fmt.Errorf("executing command %q in sandbox: %v", args, err)
+	}
+	return pid, nil
+}
+
+// Event retrieves stats about the sandbox such as memory and CPU utilization.
+func (s *Sandbox) Event(cid string) (*boot.Event, error) {
+ log.Debugf("Getting events for container %q in sandbox %q", cid, s.ID)
+ conn, err := s.sandboxConnect()
+ if err != nil {
+ return nil, err
+ }
+ defer conn.Close()
+
+ var e boot.Event
+ // TODO(b/129292330): Pass in the container id (cid) here. The sandbox
+ // should return events only for that container.
+ if err := conn.Call(boot.ContainerEvent, nil, &e); err != nil {
+ return nil, fmt.Errorf("retrieving event data from sandbox: %v", err)
+ }
+ e.ID = cid
+ return &e, nil
+}
+
+// sandboxConnect opens a client connection to the control socket whose
+// address is derived from the sandbox ID. Connection failures are wrapped
+// with connError before being returned.
+func (s *Sandbox) sandboxConnect() (*urpc.Client, error) {
+	log.Debugf("Connecting to sandbox %q", s.ID)
+
+	addr := boot.ControlSocketAddr(s.ID)
+	c, err := client.ConnectTo(addr)
+	if err == nil {
+		return c, nil
+	}
+	return nil, s.connError(err)
+}
+
+// connError wraps err with a message that names the sandbox PID whose control
+// server could not be reached.
+func (s *Sandbox) connError(err error) error {
+	pid := s.Pid
+	return fmt.Errorf("connecting to control server at PID %d: %v", pid, err)
+}
+
+// createSandboxProcess starts the sandbox as a subprocess by running the "boot"
+// command, passing in the bundle dir.
+func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, userLog string, ioFiles []*os.File, mountsFile, startSyncFile *os.File) error {
+ // nextFD is used to get unused FDs that we can pass to the sandbox. It
+ // starts at 3 because 0, 1, and 2 are taken by stdin/out/err.
+ nextFD := 3
+
+ binPath := specutils.ExePath
+ cmd := exec.Command(binPath, conf.ToFlags()...)
+ cmd.SysProcAttr = &syscall.SysProcAttr{}
+
+ // Open the log files to pass to the sandbox as FDs.
+ //
+ // These flags must come BEFORE the "boot" command in cmd.Args.
+ if conf.LogFilename != "" {
+ logFile, err := os.OpenFile(conf.LogFilename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
+ if err != nil {
+ return fmt.Errorf("opening log file %q: %v", conf.LogFilename, err)
+ }
+ defer logFile.Close()
+ cmd.ExtraFiles = append(cmd.ExtraFiles, logFile)
+ cmd.Args = append(cmd.Args, "--log-fd="+strconv.Itoa(nextFD))
+ nextFD++
+ }
+ if conf.DebugLog != "" {
+ debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "boot")
+ if err != nil {
+ return fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err)
+ }
+ defer debugLogFile.Close()
+ cmd.ExtraFiles = append(cmd.ExtraFiles, debugLogFile)
+ cmd.Args = append(cmd.Args, "--debug-log-fd="+strconv.Itoa(nextFD))
+ nextFD++
+ }
+
+ // Add the "boot" command to the args.
+ //
+ // All flags after this must be for the boot command
+ cmd.Args = append(cmd.Args, "boot", "--bundle="+bundleDir)
+
+ // Create a socket for the control server and donate it to the sandbox.
+ addr := boot.ControlSocketAddr(s.ID)
+ sockFD, err := server.CreateSocket(addr)
+ log.Infof("Creating sandbox process with addr: %s", addr[1:]) // skip "\00".
+ if err != nil {
+ return fmt.Errorf("creating control server socket for sandbox %q: %v", s.ID, err)
+ }
+ controllerFile := os.NewFile(uintptr(sockFD), "control_server_socket")
+ defer controllerFile.Close()
+ cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile)
+ cmd.Args = append(cmd.Args, "--controller-fd="+strconv.Itoa(nextFD))
+ nextFD++
+
+ defer mountsFile.Close()
+ cmd.ExtraFiles = append(cmd.ExtraFiles, mountsFile)
+ cmd.Args = append(cmd.Args, "--mounts-fd="+strconv.Itoa(nextFD))
+ nextFD++
+
+ specFile, err := specutils.OpenSpec(bundleDir)
+ if err != nil {
+ return err
+ }
+ defer specFile.Close()
+ cmd.ExtraFiles = append(cmd.ExtraFiles, specFile)
+ cmd.Args = append(cmd.Args, "--spec-fd="+strconv.Itoa(nextFD))
+ nextFD++
+
+ cmd.ExtraFiles = append(cmd.ExtraFiles, startSyncFile)
+ cmd.Args = append(cmd.Args, "--start-sync-fd="+strconv.Itoa(nextFD))
+ nextFD++
+
+ // If there is a gofer, sends all socket ends to the sandbox.
+ for _, f := range ioFiles {
+ defer f.Close()
+ cmd.ExtraFiles = append(cmd.ExtraFiles, f)
+ cmd.Args = append(cmd.Args, "--io-fds="+strconv.Itoa(nextFD))
+ nextFD++
+ }
+
+ // If the platform needs a device FD we must pass it in.
+ if deviceFile, err := deviceFileForPlatform(conf.Platform); err != nil {
+ return err
+ } else if deviceFile != nil {
+ defer deviceFile.Close()
+ cmd.ExtraFiles = append(cmd.ExtraFiles, deviceFile)
+ cmd.Args = append(cmd.Args, "--device-fd="+strconv.Itoa(nextFD))
+ nextFD++
+ }
+
+ // The current process' stdio must be passed to the application via the
+ // --stdio-fds flag. The stdio of the sandbox process itself must not
+ // be connected to the same FDs, otherwise we risk leaking sandbox
+ // errors to the application, so we set the sandbox stdio to nil,
+ // causing them to read/write from the null device.
+ cmd.Stdin = nil
+ cmd.Stdout = nil
+ cmd.Stderr = nil
+
+ // If the console control socket file is provided, then create a new
+ // pty master/slave pair and set the TTY on the sandbox process.
+ if consoleSocket != "" {
+ cmd.Args = append(cmd.Args, "--console=true")
+
+ // console.NewWithSocket will send the master on the given
+ // socket, and return the slave.
+ tty, err := console.NewWithSocket(consoleSocket)
+ if err != nil {
+ return fmt.Errorf("setting up console with socket %q: %v", consoleSocket, err)
+ }
+ defer tty.Close()
+
+ // Set the TTY as a controlling TTY on the sandbox process.
+ // Note that the Ctty field must be the FD of the TTY in the
+ // *new* process, not this process. Since we are about to
+ // assign the TTY to nextFD, we can use that value here.
+ // stdin, we can use FD 0 here.
+ cmd.SysProcAttr.Setctty = true
+ cmd.SysProcAttr.Ctty = nextFD
+
+ // Pass the tty as all stdio fds to sandbox.
+ for i := 0; i < 3; i++ {
+ cmd.ExtraFiles = append(cmd.ExtraFiles, tty)
+ cmd.Args = append(cmd.Args, "--stdio-fds="+strconv.Itoa(nextFD))
+ nextFD++
+ }
+
+ if conf.Debug {
+ // If debugging, send the boot process stdio to the
+ // TTY, so that it is easier to find.
+ cmd.Stdin = tty
+ cmd.Stdout = tty
+ cmd.Stderr = tty
+ }
+ } else {
+ // If not using a console, pass our current stdio as the
+ // container stdio via flags.
+ for _, f := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
+ cmd.ExtraFiles = append(cmd.ExtraFiles, f)
+ cmd.Args = append(cmd.Args, "--stdio-fds="+strconv.Itoa(nextFD))
+ nextFD++
+ }
+
+ if conf.Debug {
+ // If debugging, send the boot process stdio to the
+ // this process' stdio, so that is is easier to find.
+ cmd.Stdin = os.Stdin
+ cmd.Stdout = os.Stdout
+ cmd.Stderr = os.Stderr
+ }
+ }
+
+ // Detach from this session, otherwise cmd will get SIGHUP and SIGCONT
+ // when re-parented.
+ cmd.SysProcAttr.Setsid = true
+
+ // nss is the set of namespaces to join or create before starting the sandbox
+ // process. Mount, IPC and UTS namespaces from the host are not used as they
+ // are virtualized inside the sandbox. Be paranoid and run inside an empty
+ // namespace for these. Don't unshare cgroup because sandbox is added to a
+ // cgroup in the caller's namespace.
+ log.Infof("Sandbox will be started in new mount, IPC and UTS namespaces")
+ nss := []specs.LinuxNamespace{
+ {Type: specs.IPCNamespace},
+ {Type: specs.MountNamespace},
+ {Type: specs.UTSNamespace},
+ }
+
+ if conf.Platform == boot.PlatformPtrace {
+ // TODO(b/75837838): Also set a new PID namespace so that we limit
+ // access to other host processes.
+ log.Infof("Sandbox will be started in the current PID namespace")
+ } else {
+ log.Infof("Sandbox will be started in a new PID namespace")
+ nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace})
+ cmd.Args = append(cmd.Args, "--pidns=true")
+ }
+
+ // Joins the network namespace if network is enabled. the sandbox talks
+ // directly to the host network, which may have been configured in the
+ // namespace.
+ if ns, ok := specutils.GetNS(specs.NetworkNamespace, spec); ok && conf.Network != boot.NetworkNone {
+ log.Infof("Sandbox will be started in the container's network namespace: %+v", ns)
+ nss = append(nss, ns)
+ } else if conf.Network == boot.NetworkHost {
+ log.Infof("Sandbox will be started in the host network namespace")
+ } else {
+ log.Infof("Sandbox will be started in new network namespace")
+ nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace})
+ }
+
+ // User namespace depends on the network type. Host network requires to run
+ // inside the user namespace specified in the spec or the current namespace
+ // if none is configured.
+ if conf.Network == boot.NetworkHost {
+ if userns, ok := specutils.GetNS(specs.UserNamespace, spec); ok {
+ log.Infof("Sandbox will be started in container's user namespace: %+v", userns)
+ nss = append(nss, userns)
+ specutils.SetUIDGIDMappings(cmd, spec)
+ } else {
+ log.Infof("Sandbox will be started in the current user namespace")
+ }
+ // When running in the caller's defined user namespace, apply the same
+ // capabilities to the sandbox process to ensure it abides to the same
+ // rules.
+ cmd.Args = append(cmd.Args, "--apply-caps=true")
+
+ // If we have CAP_SYS_ADMIN, we can create an empty chroot and
+ // bind-mount the executable inside it.
+ if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+ log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!")
+
+ } else if specutils.HasCapabilities(capability.CAP_SYS_ADMIN) {
+ log.Infof("Sandbox will be started in minimal chroot")
+ cmd.Args = append(cmd.Args, "--setup-root")
+ } else {
+ return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN")
+ }
+ } else {
+ // If we have CAP_SETUID and CAP_SETGID, then we can also run
+ // as user nobody.
+ if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+ log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid())
+ log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!")
+ } else if specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) {
+ log.Infof("Sandbox will be started in new user namespace")
+ nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace})
+
+ // Map nobody in the new namespace to nobody in the parent namespace.
+ //
+ // A sandbox process will construct an empty
+ // root for itself, so it has to have the CAP_SYS_ADMIN
+ // capability.
+ //
+ // FIXME(b/122554829): The current implementations of
+ // os/exec doesn't allow to set ambient capabilities if
+ // a process is started in a new user namespace. As a
+ // workaround, we start the sandbox process with the 0
+ // UID and then it constructs a chroot and sets UID to
+ // nobody. https://github.com/golang/go/issues/2315
+ const nobody = 65534
+ cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
+ {
+ ContainerID: int(0),
+ HostID: int(nobody - 1),
+ Size: int(1),
+ },
+ {
+ ContainerID: int(nobody),
+ HostID: int(nobody),
+ Size: int(1),
+ },
+ }
+ cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
+ {
+ ContainerID: int(nobody),
+ HostID: int(nobody),
+ Size: int(1),
+ },
+ }
+
+ // Set credentials to run as user and group nobody.
+ cmd.SysProcAttr.Credential = &syscall.Credential{
+ Uid: 0,
+ Gid: nobody,
+ }
+ cmd.Args = append(cmd.Args, "--setup-root")
+ } else {
+ return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID")
+ }
+ }
+
+ cmd.Args[0] = "runsc-sandbox"
+
+ if s.Cgroup != nil {
+ cpuNum, err := s.Cgroup.NumCPU()
+ if err != nil {
+ return fmt.Errorf("getting cpu count from cgroups: %v", err)
+ }
+ cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum))
+
+ mem, err := s.Cgroup.MemoryLimit()
+ if err != nil {
+ return fmt.Errorf("getting memory limit from cgroups: %v", err)
+ }
+ // When memory limit is unset, a "large" number is returned. In that case,
+ // just stick with the default.
+ if mem < 0x7ffffffffffff000 {
+ cmd.Args = append(cmd.Args, "--total-memory", strconv.FormatUint(mem, 10))
+ }
+ }
+
+ if userLog != "" {
+ f, err := os.OpenFile(userLog, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664)
+ if err != nil {
+ return fmt.Errorf("opening compat log file: %v", err)
+ }
+ defer f.Close()
+
+ cmd.ExtraFiles = append(cmd.ExtraFiles, f)
+ cmd.Args = append(cmd.Args, "--user-log-fd", strconv.Itoa(nextFD))
+ nextFD++
+ }
+
+ // Add container as the last argument.
+ cmd.Args = append(cmd.Args, s.ID)
+
+ // Log the FDs we are donating to the sandbox process.
+ for i, f := range cmd.ExtraFiles {
+ log.Debugf("Donating FD %d: %q", i+3, f.Name())
+ }
+
+ log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args)
+ log.Debugf("SysProcAttr: %+v", cmd.SysProcAttr)
+ if err := specutils.StartInNS(cmd, nss); err != nil {
+ return fmt.Errorf("Sandbox: %v", err)
+ }
+ s.child = true
+ s.Pid = cmd.Process.Pid
+ log.Infof("Sandbox started, PID: %d", s.Pid)
+
+ return nil
+}
+
+// Wait blocks until the container identified by cid exits and returns its
+// WaitStatus.
+//
+// The Wait RPC is attempted first. If the sandbox cannot be reached, or the
+// RPC fails, the sandbox process itself is waited on and its exit status is
+// returned instead; that status is only available when the sandbox is a child
+// of the current process.
+func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) {
+	log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID)
+	var ws syscall.WaitStatus
+
+	conn, connErr := s.sandboxConnect()
+	if connErr == nil {
+		defer conn.Close()
+		// Ask the sandbox for the container's exit status.
+		rpcErr := conn.Call(boot.ContainerWait, &cid, &ws)
+		if rpcErr == nil {
+			// The RPC succeeded; ws holds the container status.
+			return ws, nil
+		}
+		// The sandbox may have exited after we connected, but before
+		// or during the Wait RPC.
+		log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, rpcErr)
+	} else {
+		// The sandbox may have exited before we had a chance to wait
+		// on it.
+		log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, connErr)
+	}
+
+	// Fall back to the sandbox process exit status, which in most cases
+	// matches the container exit status.
+	if err := s.waitForStopped(); err != nil {
+		return ws, err
+	}
+	if s.child {
+		return s.status, nil
+	}
+	return ws, fmt.Errorf("sandbox no longer running and its exit status is unavailable")
+}
+
+// WaitPID asks the sandbox to wait for process 'pid' of container 'cid' and
+// returns that process' WaitStatus. clearStatus is forwarded to the sandbox
+// unchanged in the request.
+func (s *Sandbox) WaitPID(cid string, pid int32, clearStatus bool) (syscall.WaitStatus, error) {
+	log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID)
+	var status syscall.WaitStatus
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return status, err
+	}
+	defer conn.Close()
+
+	req := boot.WaitPIDArgs{PID: pid, CID: cid, ClearStatus: clearStatus}
+	err = conn.Call(boot.ContainerWaitPID, &req, &status)
+	if err != nil {
+		return status, fmt.Errorf("waiting on PID %d in sandbox %q: %v", pid, s.ID, err)
+	}
+	return status, nil
+}
+
+// IsRootContainer reports whether cid identifies the root container, i.e.
+// whether it is equal to the sandbox ID.
+func (s *Sandbox) IsRootContainer(cid string) bool {
+	isRoot := cid == s.ID
+	return isRoot
+}
+
+// destroy frees all resources associated with the sandbox: if a sandbox PID
+// is recorded, the process is sent SIGKILL and then waited on until it has
+// stopped. It fails fast and is idempotent.
+//
+// Returns an error if the signal cannot be delivered (other than because the
+// process no longer exists) or if the sandbox does not stop in time.
+func (s *Sandbox) destroy() error {
+	log.Debugf("Destroy sandbox %q", s.ID)
+	if s.Pid == 0 {
+		// No sandbox process is recorded; there is nothing to kill.
+		return nil
+	}
+
+	log.Debugf("Killing sandbox %q", s.ID)
+	// ESRCH means the process is already gone, which is not an error here.
+	if err := syscall.Kill(s.Pid, syscall.SIGKILL); err != nil && err != syscall.ESRCH {
+		// The PID is an integer: format it with %d. %q would print it as a
+		// quoted character literal rather than as a number.
+		return fmt.Errorf("killing sandbox %q PID %d: %v", s.ID, s.Pid, err)
+	}
+	if err := s.waitForStopped(); err != nil {
+		return fmt.Errorf("waiting sandbox %q stop: %v", s.ID, err)
+	}
+	return nil
+}
+
+// SignalContainer delivers sig to container cid through the sandbox. When all
+// is true the signal is delivered in DeliverToAllProcesses mode, otherwise in
+// DeliverToProcess mode. According to the original documentation, with all
+// set and sig == SIGKILL the call waits for all processes to exit before
+// returning (that wait happens on the sandbox side and is not visible here).
+func (s *Sandbox) SignalContainer(cid string, sig syscall.Signal, all bool) error {
+	log.Debugf("Signal sandbox %q", s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	req := boot.SignalArgs{
+		CID:   cid,
+		Signo: int32(sig),
+		Mode:  boot.DeliverToProcess,
+	}
+	if all {
+		req.Mode = boot.DeliverToAllProcesses
+	}
+
+	err = conn.Call(boot.ContainerSignal, &req, nil)
+	if err != nil {
+		return fmt.Errorf("signaling container %q: %v", cid, err)
+	}
+	return nil
+}
+
+// SignalProcess delivers sig to process pid of container cid. When fgProcess
+// is true the signal is sent in DeliverToForegroundProcessGroup mode, i.e. to
+// the foreground process group of the session that pid belongs to; this is
+// only valid if the process is attached to a host TTY.
+func (s *Sandbox) SignalProcess(cid string, pid int32, sig syscall.Signal, fgProcess bool) error {
+	log.Debugf("Signal sandbox %q", s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	req := boot.SignalArgs{
+		CID:   cid,
+		Signo: int32(sig),
+		PID:   pid,
+		Mode:  boot.DeliverToProcess,
+	}
+	if fgProcess {
+		req.Mode = boot.DeliverToForegroundProcessGroup
+	}
+
+	err = conn.Call(boot.ContainerSignal, &req, nil)
+	if err != nil {
+		return fmt.Errorf("signaling container %q PID %d: %v", cid, pid, err)
+	}
+	return nil
+}
+
+// Checkpoint issues the checkpoint call to the sandbox, donating f as the
+// file the state is to be written to. cid is used only to label the error.
+func (s *Sandbox) Checkpoint(cid string, f *os.File) error {
+	log.Debugf("Checkpoint sandbox %q", s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	saveOpts := control.SaveOpts{
+		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
+	}
+	err = conn.Call(boot.ContainerCheckpoint, &saveOpts, nil)
+	if err != nil {
+		return fmt.Errorf("checkpointing container %q: %v", cid, err)
+	}
+	return nil
+}
+
+// Pause issues the pause call to the sandbox. cid is used only to label the
+// error returned on failure.
+func (s *Sandbox) Pause(cid string) error {
+	log.Debugf("Pause sandbox %q", s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	err = conn.Call(boot.ContainerPause, nil, nil)
+	if err != nil {
+		return fmt.Errorf("pausing container %q: %v", cid, err)
+	}
+	return nil
+}
+
+// Resume issues the resume call to the sandbox. cid is used only to label the
+// error returned on failure.
+func (s *Sandbox) Resume(cid string) error {
+	log.Debugf("Resume sandbox %q", s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	err = conn.Call(boot.ContainerResume, nil, nil)
+	if err != nil {
+		return fmt.Errorf("resuming container %q: %v", cid, err)
+	}
+	return nil
+}
+
+// IsRunning reports whether the process recorded in s.Pid is running. It
+// returns false when no PID is recorded or when probing the process fails.
+func (s *Sandbox) IsRunning() bool {
+	if s.Pid == 0 {
+		return false
+	}
+	// Signal 0 performs the existence check without delivering a signal.
+	return syscall.Kill(s.Pid, 0) == nil
+}
+
+// Stacks asks the sandbox for its stacks and returns them as a single string.
+// On failure the returned string is empty.
+func (s *Sandbox) Stacks() (string, error) {
+	log.Debugf("Stacks sandbox %q", s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return "", err
+	}
+	defer conn.Close()
+
+	var dump string
+	err = conn.Call(boot.SandboxStacks, nil, &dump)
+	if err != nil {
+		return "", fmt.Errorf("getting sandbox %q stacks: %v", s.ID, err)
+	}
+	return dump, nil
+}
+
+// HeapProfile requests a heap profile from the sandbox, donating f as the
+// file the profile is to be written to.
+func (s *Sandbox) HeapProfile(f *os.File) error {
+	log.Debugf("Heap profile %q", s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	profOpts := control.ProfileOpts{
+		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
+	}
+	err = conn.Call(boot.HeapProfile, &profOpts, nil)
+	if err != nil {
+		return fmt.Errorf("getting sandbox %q heap profile: %v", s.ID, err)
+	}
+	return nil
+}
+
+// StartCPUProfile asks the sandbox to start a CPU profile, donating f as the
+// file the profile is to be written to.
+func (s *Sandbox) StartCPUProfile(f *os.File) error {
+	log.Debugf("CPU profile start %q", s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	profOpts := control.ProfileOpts{
+		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
+	}
+	err = conn.Call(boot.StartCPUProfile, &profOpts, nil)
+	if err != nil {
+		return fmt.Errorf("starting sandbox %q CPU profile: %v", s.ID, err)
+	}
+	return nil
+}
+
+// StopCPUProfile asks the sandbox to stop a previously started CPU profile.
+func (s *Sandbox) StopCPUProfile() error {
+	log.Debugf("CPU profile stop %q", s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	err = conn.Call(boot.StopCPUProfile, nil, nil)
+	if err != nil {
+		return fmt.Errorf("stopping sandbox %q CPU profile: %v", s.ID, err)
+	}
+	return nil
+}
+
+// StartTrace asks the sandbox to start a trace, donating f as the file the
+// trace is to be written to.
+func (s *Sandbox) StartTrace(f *os.File) error {
+	log.Debugf("Trace start %q", s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	traceOpts := control.ProfileOpts{
+		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
+	}
+	err = conn.Call(boot.StartTrace, &traceOpts, nil)
+	if err != nil {
+		return fmt.Errorf("starting sandbox %q trace: %v", s.ID, err)
+	}
+	return nil
+}
+
+// StopTrace asks the sandbox to stop a previously started trace.
+func (s *Sandbox) StopTrace() error {
+	log.Debugf("Trace stop %q", s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	err = conn.Call(boot.StopTrace, nil, nil)
+	if err != nil {
+		return fmt.Errorf("stopping sandbox %q trace: %v", s.ID, err)
+	}
+	return nil
+}
+
+// DestroyContainer destroys container cid. Destroying the root container
+// destroys the whole sandbox. For any other container, nothing is done when
+// the sandbox is no longer running; otherwise the destroy call is sent to the
+// sandbox.
+func (s *Sandbox) DestroyContainer(cid string) error {
+	switch {
+	case s.IsRootContainer(cid):
+		log.Debugf("Destroying root container %q by destroying sandbox", cid)
+		return s.destroy()
+	case !s.IsRunning():
+		// Sandbox isn't running anymore, container is already destroyed.
+		return nil
+	}
+
+	log.Debugf("Destroying container %q in sandbox %q", cid, s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	err = conn.Call(boot.ContainerDestroy, &cid, nil)
+	if err != nil {
+		return fmt.Errorf("destroying container %q: %v", cid, err)
+	}
+	return nil
+}
+
+// waitForStopped polls every 100ms, for at most 5 seconds, until the sandbox
+// process has stopped. When the sandbox is a child of the current process the
+// child is reaped, its exit status is stored in s.status and s.Pid is reset
+// to 0; otherwise the process is only probed with IsRunning.
+func (s *Sandbox) waitForStopped() error {
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	policy := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
+
+	poll := func() error {
+		if !s.child {
+			// Not our child: it cannot be reaped, only probed.
+			if s.IsRunning() {
+				return fmt.Errorf("sandbox is still running")
+			}
+			return nil
+		}
+
+		s.statusMu.Lock()
+		defer s.statusMu.Unlock()
+		if s.Pid == 0 {
+			// Already reaped.
+			return nil
+		}
+		// The sandbox process is a child of the current process, so
+		// we can wait on it and collect its zombie.
+		reaped, err := syscall.Wait4(int(s.Pid), &s.status, syscall.WNOHANG, nil)
+		if err != nil {
+			return fmt.Errorf("error waiting the sandbox process: %v", err)
+		}
+		if reaped == 0 {
+			// WNOHANG returned without a state change.
+			return fmt.Errorf("sandbox is still running")
+		}
+		s.Pid = 0
+		return nil
+	}
+	return backoff.Retry(poll, policy)
+}
+
+// deviceFileForPlatform opens the device file for the given platform. If the
+// platform does not need a device file, then nil is returned.
+func deviceFileForPlatform(p boot.PlatformType) (*os.File, error) {
+ var (
+ f *os.File
+ err error
+ )
+ switch p {
+ case boot.PlatformKVM:
+ f, err = kvm.OpenDevice()
+ default:
+ return nil, nil
+ }
+ if err != nil {
+ return nil, fmt.Errorf("opening device file for platform %q: %v", p, err)
+ }
+ return f, err
+}
diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go
new file mode 100644
index 000000000..1f3afb4e4
--- /dev/null
+++ b/runsc/specutils/fs.go
@@ -0,0 +1,137 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package specutils
+
+import (
+ "fmt"
+ "path"
+ "syscall"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+// mapping describes how one mount option translates to mount(2) flag bits.
+type mapping struct {
+ // set is true when the option turns the flag bits in val on. It is false
+ // for the negated spelling of a flag (e.g. "rw" for MS_RDONLY).
+ set bool
+ // val holds the mount(2) flag bit(s) the option refers to.
+ val uint32
+}
+
+// optionsMap maps OCI filesystem mount options to mount(2) syscall flags.
+// Propagation options are not listed here; they are kept separately in
+// propOptionsMap. Entries with set == false are the negated spelling of the
+// flag stored in val (e.g. "rw" refers to MS_RDONLY).
+var optionsMap = map[string]mapping{
+ "acl": {set: true, val: syscall.MS_POSIXACL},
+ "async": {set: false, val: syscall.MS_SYNCHRONOUS},
+ "atime": {set: false, val: syscall.MS_NOATIME},
+ "bind": {set: true, val: syscall.MS_BIND},
+ "defaults": {set: true, val: 0},
+ "dev": {set: false, val: syscall.MS_NODEV},
+ "diratime": {set: false, val: syscall.MS_NODIRATIME},
+ "dirsync": {set: true, val: syscall.MS_DIRSYNC},
+ "exec": {set: false, val: syscall.MS_NOEXEC},
+ "noexec": {set: true, val: syscall.MS_NOEXEC},
+ "iversion": {set: true, val: syscall.MS_I_VERSION},
+ "loud": {set: false, val: syscall.MS_SILENT},
+ "mand": {set: true, val: syscall.MS_MANDLOCK},
+ "noacl": {set: false, val: syscall.MS_POSIXACL},
+ "noatime": {set: true, val: syscall.MS_NOATIME},
+ "nodev": {set: true, val: syscall.MS_NODEV},
+ "nodiratime": {set: true, val: syscall.MS_NODIRATIME},
+ "noiversion": {set: false, val: syscall.MS_I_VERSION},
+ "nomand": {set: false, val: syscall.MS_MANDLOCK},
+ "norelatime": {set: false, val: syscall.MS_RELATIME},
+ "nostrictatime": {set: false, val: syscall.MS_STRICTATIME},
+ "nosuid": {set: true, val: syscall.MS_NOSUID},
+ "rbind": {set: true, val: syscall.MS_BIND | syscall.MS_REC},
+ "relatime": {set: true, val: syscall.MS_RELATIME},
+ "remount": {set: true, val: syscall.MS_REMOUNT},
+ "ro": {set: true, val: syscall.MS_RDONLY},
+ "rw": {set: false, val: syscall.MS_RDONLY},
+ "silent": {set: true, val: syscall.MS_SILENT},
+ "strictatime": {set: true, val: syscall.MS_STRICTATIME},
+ "suid": {set: false, val: syscall.MS_NOSUID},
+ "sync": {set: true, val: syscall.MS_SYNCHRONOUS},
+}
+
+// propOptionsMap is similar to optionsMap, but it lists the mount propagation
+// options, which cannot be used together with other flags. The "r"-prefixed
+// variants additionally carry MS_REC.
+var propOptionsMap = map[string]mapping{
+ "private": {set: true, val: syscall.MS_PRIVATE},
+ "rprivate": {set: true, val: syscall.MS_PRIVATE | syscall.MS_REC},
+ "slave": {set: true, val: syscall.MS_SLAVE},
+ "rslave": {set: true, val: syscall.MS_SLAVE | syscall.MS_REC},
+ "unbindable": {set: true, val: syscall.MS_UNBINDABLE},
+ "runbindable": {set: true, val: syscall.MS_UNBINDABLE | syscall.MS_REC},
+}
+
+// invalidOptions lists the mount options that are not allowed.
+// - shared, rshared: the sandbox must be isolated from the host. Propagating
+// mount changes from the sandbox to the host breaks the isolation.
+var invalidOptions = []string{"shared", "rshared"}
+
+// OptionsToFlags translates the mount options in opts into mount(2) syscall
+// flags, looking each option up in optionsMap.
+func OptionsToFlags(opts []string) uint32 {
+	flags := optionsToFlags(opts, optionsMap)
+	return flags
+}
+
+// PropOptionsToFlags translates the propagation mount options in opts into
+// syscall flags, looking each option up in propOptionsMap. Propagation
+// options cannot be set together with other options and must be handled
+// separately.
+func PropOptionsToFlags(opts []string) uint32 {
+	flags := optionsToFlags(opts, propOptionsMap)
+	return flags
+}
+
+// optionsToFlags folds opts, in order, into a mount(2) flag word using the
+// option table in source.
+//
+// Options missing from source are ignored. An option whose mapping has
+// set == true turns its flag bits on; one with set == false (the negated
+// spelling of a flag, e.g. "rw" for MS_RDONLY) turns them off. When opts
+// contains conflicting options, the later one wins.
+func optionsToFlags(opts []string, source map[string]mapping) uint32 {
+	var rv uint32
+	for _, opt := range opts {
+		m, ok := source[opt]
+		if !ok {
+			continue
+		}
+		if m.set {
+			rv |= m.val
+		} else {
+			// Clear the bits instead of toggling them: XOR would
+			// switch the flag on whenever it is not already set, so
+			// a lone "rw" would produce MS_RDONLY.
+			rv &^= m.val
+		}
+	}
+	return rv
+}
+
+// validateMount checks a single spec mount. The destination must be an
+// absolute path. For mounts of type "bind", every option must be known
+// (present in optionsMap or propOptionsMap) and must not be one of
+// invalidOptions.
+func validateMount(mnt *specs.Mount) error {
+	if !path.IsAbs(mnt.Destination) {
+		return fmt.Errorf("Mount.Destination must be an absolute path: %v", mnt)
+	}
+	if mnt.Type != "bind" {
+		// Options are only validated for bind mounts.
+		return nil
+	}
+
+	for _, opt := range mnt.Options {
+		if ContainsStr(invalidOptions, opt) {
+			return fmt.Errorf("mount option %q is not supported: %v", opt, mnt)
+		}
+		_, known := optionsMap[opt]
+		if !known {
+			_, known = propOptionsMap[opt]
+		}
+		if !known {
+			return fmt.Errorf("unknown mount option %q", opt)
+		}
+	}
+	return nil
+}
+
+// ValidateRootfsPropagation validates that rootfs propagation options are
+// correct.
+func validateRootfsPropagation(opt string) error {
+ flags := PropOptionsToFlags([]string{opt})
+ if flags&(syscall.MS_SLAVE|syscall.MS_PRIVATE) == 0 {
+ return fmt.Errorf("root mount propagation option must specify private or slave: %q", opt)
+ }
+ return nil
+}
diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go
new file mode 100644
index 000000000..7d194335c
--- /dev/null
+++ b/runsc/specutils/namespace.go
@@ -0,0 +1,222 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package specutils
+
+import (
+ "fmt"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "runtime"
+ "syscall"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "github.com/syndtr/gocapability/capability"
+ "golang.org/x/sys/unix"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+)
+
+// nsCloneFlag maps a namespace type to the CLONE_NEW* flag used to create or
+// join a namespace of that type. It panics for the cgroup namespace, which
+// has no flag assigned here, and for unknown namespace types.
+func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr {
+	switch nst {
+	case specs.CgroupNamespace:
+		panic("cgroup namespace has no associated clone flag")
+	case specs.UserNamespace:
+		return syscall.CLONE_NEWUSER
+	case specs.UTSNamespace:
+		return syscall.CLONE_NEWUTS
+	case specs.PIDNamespace:
+		return syscall.CLONE_NEWPID
+	case specs.NetworkNamespace:
+		return syscall.CLONE_NEWNET
+	case specs.MountNamespace:
+		return syscall.CLONE_NEWNS
+	case specs.IPCNamespace:
+		return syscall.CLONE_NEWIPC
+	}
+	panic(fmt.Sprintf("unknown namespace %v", nst))
+}
+
+// nsPath returns the path of the namespace for the current process and the
+// given namespace.
+func nsPath(nst specs.LinuxNamespaceType) string {
+ base := "/proc/self/ns"
+ switch nst {
+ case specs.CgroupNamespace:
+ return filepath.Join(base, "cgroup")
+ case specs.IPCNamespace:
+ return filepath.Join(base, "ipc")
+ case specs.MountNamespace:
+ return filepath.Join(base, "mnt")
+ case specs.NetworkNamespace:
+ return filepath.Join(base, "net")
+ case specs.PIDNamespace:
+ return filepath.Join(base, "pid")
+ case specs.UserNamespace:
+ return filepath.Join(base, "user")
+ case specs.UTSNamespace:
+ return filepath.Join(base, "uts")
+ default:
+ panic(fmt.Sprintf("unknown namespace %v", nst))
+ }
+}
+
+// GetNS looks up the first namespace of type nst in the spec. It returns the
+// namespace and true when found, or the zero LinuxNamespace and false when
+// the spec has no Linux section or no namespace of that type.
+func GetNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) {
+	if s.Linux != nil {
+		all := s.Linux.Namespaces
+		for i := range all {
+			if all[i].Type == nst {
+				return all[i], true
+			}
+		}
+	}
+	return specs.LinuxNamespace{}, false
+}
+
+// FilterNS returns the namespaces from the spec whose types appear in
+// `filter`, in the order the types are listed in `filter`. It returns nil
+// when the spec has no Linux section or nothing matches.
+func FilterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNamespace {
+	var matched []specs.LinuxNamespace
+	if s.Linux == nil {
+		return matched
+	}
+	for i := range filter {
+		ns, found := GetNS(filter[i], s)
+		if !found {
+			continue
+		}
+		matched = append(matched, ns)
+	}
+	return matched
+}
+
+// setNS invokes setns(2) with the namespace file descriptor fd and the
+// namespace type nsType. It must be called with the OS thread locked.
+func setNS(fd, nsType uintptr) error {
+	_, _, errno := syscall.RawSyscall(unix.SYS_SETNS, fd, nsType, 0)
+	if errno == 0 {
+		return nil
+	}
+	return errno
+}
+
+// ApplyNS switches the current thread into the namespace at ns.Path and
+// returns a function that switches it back to the namespace it was in
+// before. The returned function panics if the original namespace cannot be
+// restored.
+//
+// Preconditions: Must be called with os thread locked.
+func ApplyNS(ns specs.LinuxNamespace) (func(), error) {
+	log.Infof("Applying namespace %v at path %q", ns.Type, ns.Path)
+	target, err := os.Open(ns.Path)
+	if err != nil {
+		return nil, fmt.Errorf("error opening %q: %v", ns.Path, err)
+	}
+	defer target.Close()
+
+	// Keep a handle on the current namespace so it can be restored later.
+	selfPath := nsPath(ns.Type)
+	saved, err := os.Open(selfPath)
+	if err != nil {
+		return nil, fmt.Errorf("error opening %q: %v", selfPath, err)
+	}
+
+	// Enter the requested namespace.
+	cloneFlag := nsCloneFlag(ns.Type)
+	if err := setNS(target.Fd(), cloneFlag); err != nil {
+		saved.Close()
+		return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err)
+	}
+
+	restore := func() {
+		log.Infof("Restoring namespace %v", ns.Type)
+		defer saved.Close()
+		if err := setNS(saved.Fd(), cloneFlag); err != nil {
+			panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err))
+		}
+	}
+	return restore, nil
+}
+
+// StartInNS starts cmd inside the given namespaces. A namespace with a path
+// is joined by the current thread for the duration of the call and restored
+// before returning; a namespace without a path is created for the child by
+// adding its clone flag to cmd.SysProcAttr.Cloneflags.
+func StartInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error {
+	// Namespaces are changed per thread, so the OS thread must stay locked
+	// to keep Go from moving this goroutine to another thread.
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	if cmd.SysProcAttr == nil {
+		cmd.SysProcAttr = &syscall.SysProcAttr{}
+	}
+
+	for _, ns := range nss {
+		if ns.Path != "" {
+			// Join the given namespace now; the deferred call
+			// restores the current one when this function returns.
+			restore, err := ApplyNS(ns)
+			if err != nil {
+				return err
+			}
+			defer restore()
+			continue
+		}
+		// No path. Just set a flag to create a new namespace.
+		cmd.SysProcAttr.Cloneflags |= nsCloneFlag(ns.Type)
+	}
+
+	return cmd.Start()
+}
+
+// SetUIDGIDMappings appends the uid/gid mappings from the spec to
+// cmd.SysProcAttr, allocating SysProcAttr if needed. It does nothing when the
+// spec has no Linux section.
+func SetUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) {
+	if s.Linux == nil {
+		return
+	}
+	if cmd.SysProcAttr == nil {
+		cmd.SysProcAttr = &syscall.SysProcAttr{}
+	}
+	attr := cmd.SysProcAttr
+
+	for _, m := range s.Linux.UIDMappings {
+		log.Infof("Mapping host uid %d to container uid %d (size=%d)", m.HostID, m.ContainerID, m.Size)
+		entry := syscall.SysProcIDMap{
+			ContainerID: int(m.ContainerID),
+			HostID:      int(m.HostID),
+			Size:        int(m.Size),
+		}
+		attr.UidMappings = append(attr.UidMappings, entry)
+	}
+	for _, m := range s.Linux.GIDMappings {
+		log.Infof("Mapping host gid %d to container gid %d (size=%d)", m.HostID, m.ContainerID, m.Size)
+		entry := syscall.SysProcIDMap{
+			ContainerID: int(m.ContainerID),
+			HostID:      int(m.HostID),
+			Size:        int(m.Size),
+		}
+		attr.GidMappings = append(attr.GidMappings, entry)
+	}
+}
+
+// HasCapabilities reports whether the current process has every capability in
+// 'cs' in its effective set. It returns false if the capabilities of the
+// process cannot be loaded.
+func HasCapabilities(cs ...capability.Cap) bool {
+	caps, err := capability.NewPid2(os.Getpid())
+	if err != nil || caps.Load() != nil {
+		return false
+	}
+	for i := range cs {
+		if !caps.Get(capability.EFFECTIVE, cs[i]) {
+			return false
+		}
+	}
+	return true
+}
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
new file mode 100644
index 000000000..2888f55db
--- /dev/null
+++ b/runsc/specutils/specutils.go
@@ -0,0 +1,494 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package specutils contains utility functions for working with OCI runtime
+// specs.
+package specutils
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"os"
+	"path"
+	"path/filepath"
+	"strings"
+	"syscall"
+	"time"
+
+	"github.com/cenkalti/backoff"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+)
+
+var (
+	// ExePath must point to the runsc binary, which is normally the binary
+	// that is currently running. Tests that are not linked into the same
+	// binary change it.
+	ExePath = "/proc/self/exe"
+
+	// Version is the supported spec version.
+	Version = specs.Version
+)
+
+// LogSpec writes the spec to the debug log, printing the top-level spec
+// followed by its main sections one per line so that nested pointers are
+// expanded rather than shown as addresses.
+func LogSpec(spec *specs.Spec) {
+	log.Debugf("Spec: %+v", spec)
+	log.Debugf("Spec.Hooks: %+v", spec.Hooks)
+	log.Debugf("Spec.Linux: %+v", spec.Linux)
+
+	// Resources are only reachable when the Linux section is present.
+	if l := spec.Linux; l != nil && l.Resources != nil {
+		log.Debugf("Spec.Linux.Resources.Memory: %+v", l.Resources.Memory)
+		log.Debugf("Spec.Linux.Resources.CPU: %+v", l.Resources.CPU)
+		log.Debugf("Spec.Linux.Resources.BlockIO: %+v", l.Resources.BlockIO)
+		log.Debugf("Spec.Linux.Resources.Network: %+v", l.Resources.Network)
+	}
+
+	log.Debugf("Spec.Process: %+v", spec.Process)
+	log.Debugf("Spec.Root: %+v", spec.Root)
+	log.Debugf("Spec.Mounts: %+v", spec.Mounts)
+}
+
+// ValidateSpec checks that the spec is compatible with runsc. It returns an
+// error for the first missing mandatory field, unsupported field, invalid
+// rootfs propagation, invalid mount or inconsistent containerd annotation it
+// finds. AppArmor and seccomp settings are only logged as ignored.
+func ValidateSpec(spec *specs.Spec) error {
+	// Mandatory fields.
+	switch {
+	case spec.Process == nil:
+		return fmt.Errorf("Spec.Process must be defined: %+v", spec)
+	case len(spec.Process.Args) == 0:
+		return fmt.Errorf("Spec.Process.Arg must be defined: %+v", spec.Process)
+	case spec.Root == nil:
+		return fmt.Errorf("Spec.Root must be defined: %+v", spec)
+	case len(spec.Root.Path) == 0:
+		return fmt.Errorf("Spec.Root.Path must be defined: %+v", spec.Root)
+	}
+
+	// Unsupported fields.
+	switch {
+	case spec.Solaris != nil:
+		return fmt.Errorf("Spec.Solaris is not supported: %+v", spec)
+	case spec.Windows != nil:
+		return fmt.Errorf("Spec.Windows is not supported: %+v", spec)
+	case len(spec.Process.SelinuxLabel) != 0:
+		return fmt.Errorf("SELinux is not supported: %s", spec.Process.SelinuxLabel)
+	}
+
+	// Docker uses AppArmor by default, so just log that it's being ignored.
+	if profile := spec.Process.ApparmorProfile; profile != "" {
+		log.Warningf("AppArmor profile %q is being ignored", profile)
+	}
+
+	if linuxSpec := spec.Linux; linuxSpec != nil {
+		// TODO(b/72226747): Apply seccomp to application inside sandbox.
+		if linuxSpec.Seccomp != nil {
+			log.Warningf("Seccomp spec is being ignored")
+		}
+		if linuxSpec.RootfsPropagation != "" {
+			if err := validateRootfsPropagation(linuxSpec.RootfsPropagation); err != nil {
+				return err
+			}
+		}
+	}
+
+	for _, m := range spec.Mounts {
+		if err := validateMount(&m); err != nil {
+			return err
+		}
+	}
+
+	// Two annotations are used by containerd to support multi-container pods:
+	// "io.kubernetes.cri.container-type"
+	// "io.kubernetes.cri.sandbox-id"
+	containerType, hasContainerType := spec.Annotations[ContainerdContainerTypeAnnotation]
+	_, hasSandboxID := spec.Annotations[ContainerdSandboxIDAnnotation]
+
+	// Non-containerd use won't set a container type; nothing to check then.
+	if hasContainerType {
+		switch containerType {
+		case ContainerdContainerTypeSandbox:
+			// A new sandbox needs no further annotations.
+		case ContainerdContainerTypeContainer:
+			// When starting a container in an existing sandbox, the
+			// sandbox ID must be set.
+			if !hasSandboxID {
+				return fmt.Errorf("spec has container-type of %s, but no sandbox ID set", containerType)
+			}
+		default:
+			return fmt.Errorf("unknown container-type: %s", containerType)
+		}
+	}
+
+	return nil
+}
+
+// absPath returns rel unchanged when it is already absolute; otherwise it
+// returns rel joined onto base.
+func absPath(base, rel string) string {
+	if !filepath.IsAbs(rel) {
+		rel = filepath.Join(base, rel)
+	}
+	return rel
+}
+
+// OpenSpec opens the OCI runtime spec found in the given bundle directory.
+// The spec file must be named "config.json" and live directly inside the
+// bundle directory. The caller is responsible for closing the returned file.
+func OpenSpec(bundleDir string) (*os.File, error) {
+	specPath := filepath.Join(bundleDir, "config.json")
+	return os.Open(specPath)
+}
+
+// ReadSpec reads the OCI runtime spec stored in the given bundle directory.
+// Potentially relative paths in the spec (e.g. spec.Root.Path, mount.Source)
+// are normalized into absolute paths by ReadSpecFromFile.
+func ReadSpec(bundleDir string) (*specs.Spec, error) {
+	f, err := OpenSpec(bundleDir)
+	if err != nil {
+		specPath := filepath.Join(bundleDir, "config.json")
+		return nil, fmt.Errorf("error opening spec file %q: %v", specPath, err)
+	}
+	defer f.Close()
+
+	return ReadSpecFromFile(bundleDir, f)
+}
+
+// ReadSpecFromFile reads an OCI runtime spec from the given file, validates it
+// with ValidateSpec, and turns relative paths (spec.Root.Path and every
+// non-empty mount source) into absolute paths by prepending bundleDir.
+//
+// The file is rewound to its beginning before reading, so the same open file
+// may be passed more than once. An error is returned if seeking, reading,
+// unmarshaling or validation fails.
+func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error) {
+	// io.SeekStart replaces the deprecated os.SEEK_SET constant.
+	if _, err := specFile.Seek(0, io.SeekStart); err != nil {
+		return nil, fmt.Errorf("error seeking to beginning of file %q: %v", specFile.Name(), err)
+	}
+	specBytes, err := ioutil.ReadAll(specFile)
+	if err != nil {
+		return nil, fmt.Errorf("error reading spec from file %q: %v", specFile.Name(), err)
+	}
+	var spec specs.Spec
+	if err := json.Unmarshal(specBytes, &spec); err != nil {
+		return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile.Name(), err, string(specBytes))
+	}
+	// Validation guarantees spec.Root is non-nil before it is dereferenced
+	// below.
+	if err := ValidateSpec(&spec); err != nil {
+		return nil, err
+	}
+	// Turn any relative paths in the spec to absolute by prepending the bundleDir.
+	spec.Root.Path = absPath(bundleDir, spec.Root.Path)
+	for i := range spec.Mounts {
+		// Index into the slice so the update lands on the stored mount.
+		m := &spec.Mounts[i]
+		if m.Source != "" {
+			m.Source = absPath(bundleDir, m.Source)
+		}
+	}
+	return &spec, nil
+}
+
+// ReadMounts reads the whole of f and decodes it as a JSON list of mounts.
+func ReadMounts(f *os.File) ([]specs.Mount, error) {
+	data, err := ioutil.ReadAll(f)
+	if err != nil {
+		return nil, fmt.Errorf("error reading mounts: %v", err)
+	}
+
+	var mounts []specs.Mount
+	if err = json.Unmarshal(data, &mounts); err != nil {
+		return nil, fmt.Errorf("error unmarshaling mounts: %v\n %s", err, string(data))
+	}
+	return mounts, nil
+}
+
+// Capabilities takes in spec and returns a TaskCapabilities corresponding to
+// the spec.
+func Capabilities(enableRaw bool, specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) {
+ // Strip CAP_NET_RAW from all capability sets if necessary.
+ skipSet := map[linux.Capability]struct{}{}
+ if !enableRaw {
+ skipSet[linux.CAP_NET_RAW] = struct{}{}
+ }
+
+ var caps auth.TaskCapabilities
+ if specCaps != nil {
+ var err error
+ if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding, skipSet); err != nil {
+ return nil, err
+ }
+ if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective, skipSet); err != nil {
+ return nil, err
+ }
+ if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable, skipSet); err != nil {
+ return nil, err
+ }
+ if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted, skipSet); err != nil {
+ return nil, err
+ }
+ // TODO(nlacasse): Support ambient capabilities.
+ }
+ return &caps, nil
+}
+
+// AllCapabilities returns a LinuxCapabilities struct with all capabilities.
+func AllCapabilities() *specs.LinuxCapabilities {
+ var names []string
+ for n := range capFromName {
+ names = append(names, n)
+ }
+ return &specs.LinuxCapabilities{
+ Bounding: names,
+ Effective: names,
+ Inheritable: names,
+ Permitted: names,
+ Ambient: names,
+ }
+}
+
+// capFromName maps a capability name, as it is written in the spec's
+// capability lists, to the corresponding linux.Capability value. Names that
+// are not in this table are rejected by capsFromNames as unknown.
+var capFromName = map[string]linux.Capability{
+ "CAP_CHOWN": linux.CAP_CHOWN,
+ "CAP_DAC_OVERRIDE": linux.CAP_DAC_OVERRIDE,
+ "CAP_DAC_READ_SEARCH": linux.CAP_DAC_READ_SEARCH,
+ "CAP_FOWNER": linux.CAP_FOWNER,
+ "CAP_FSETID": linux.CAP_FSETID,
+ "CAP_KILL": linux.CAP_KILL,
+ "CAP_SETGID": linux.CAP_SETGID,
+ "CAP_SETUID": linux.CAP_SETUID,
+ "CAP_SETPCAP": linux.CAP_SETPCAP,
+ "CAP_LINUX_IMMUTABLE": linux.CAP_LINUX_IMMUTABLE,
+ "CAP_NET_BIND_SERVICE": linux.CAP_NET_BIND_SERVICE,
+ "CAP_NET_BROADCAST": linux.CAP_NET_BROADCAST,
+ "CAP_NET_ADMIN": linux.CAP_NET_ADMIN,
+ "CAP_NET_RAW": linux.CAP_NET_RAW,
+ "CAP_IPC_LOCK": linux.CAP_IPC_LOCK,
+ "CAP_IPC_OWNER": linux.CAP_IPC_OWNER,
+ "CAP_SYS_MODULE": linux.CAP_SYS_MODULE,
+ "CAP_SYS_RAWIO": linux.CAP_SYS_RAWIO,
+ "CAP_SYS_CHROOT": linux.CAP_SYS_CHROOT,
+ "CAP_SYS_PTRACE": linux.CAP_SYS_PTRACE,
+ "CAP_SYS_PACCT": linux.CAP_SYS_PACCT,
+ "CAP_SYS_ADMIN": linux.CAP_SYS_ADMIN,
+ "CAP_SYS_BOOT": linux.CAP_SYS_BOOT,
+ "CAP_SYS_NICE": linux.CAP_SYS_NICE,
+ "CAP_SYS_RESOURCE": linux.CAP_SYS_RESOURCE,
+ "CAP_SYS_TIME": linux.CAP_SYS_TIME,
+ "CAP_SYS_TTY_CONFIG": linux.CAP_SYS_TTY_CONFIG,
+ "CAP_MKNOD": linux.CAP_MKNOD,
+ "CAP_LEASE": linux.CAP_LEASE,
+ "CAP_AUDIT_WRITE": linux.CAP_AUDIT_WRITE,
+ "CAP_AUDIT_CONTROL": linux.CAP_AUDIT_CONTROL,
+ "CAP_SETFCAP": linux.CAP_SETFCAP,
+ "CAP_MAC_OVERRIDE": linux.CAP_MAC_OVERRIDE,
+ "CAP_MAC_ADMIN": linux.CAP_MAC_ADMIN,
+ "CAP_SYSLOG": linux.CAP_SYSLOG,
+ "CAP_WAKE_ALARM": linux.CAP_WAKE_ALARM,
+ "CAP_BLOCK_SUSPEND": linux.CAP_BLOCK_SUSPEND,
+ "CAP_AUDIT_READ": linux.CAP_AUDIT_READ,
+}
+
+// capsFromNames builds a capability set from the given capability names,
+// leaving out any capability present in skipSet. It returns an error for the
+// first name that is not found in capFromName.
+func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth.CapabilitySet, error) {
+	var kept []linux.Capability
+	for _, name := range names {
+		c, known := capFromName[name]
+		if !known {
+			return 0, fmt.Errorf("unknown capability %q", name)
+		}
+		// Only keep capabilities the caller did not ask to skip.
+		if _, skip := skipSet[c]; !skip {
+			kept = append(kept, c)
+		}
+	}
+	return auth.CapabilitySetOfMany(kept), nil
+}
+
+// Is9PMount returns true if the given mount can be mounted as an external
+// gofer: a bind mount with a non-empty source that is also a supported /dev
+// mount.
+func Is9PMount(m specs.Mount) bool {
+	if m.Type != "bind" || m.Source == "" {
+		return false
+	}
+	return IsSupportedDevMount(m)
+}
+
+// IsSupportedDevMount returns true if the mount is a supported /dev mount.
+// Only mount that does not conflict with runsc default /dev mount is
+// supported.
+func IsSupportedDevMount(m specs.Mount) bool {
+ // These are devices exist inside sentry. See pkg/sentry/fs/dev/dev.go
+ var existingDevices = []string{
+ "/dev/fd", "/dev/stdin", "/dev/stdout", "/dev/stderr",
+ "/dev/null", "/dev/zero", "/dev/full", "/dev/random",
+ "/dev/urandom", "/dev/shm", "/dev/pts", "/dev/ptmx",
+ }
+ dst := filepath.Clean(m.Destination)
+ if dst == "/dev" {
+ // OCI spec uses many different mounts for the things inside of '/dev'. We
+ // have a single mount at '/dev' that is always mounted, regardless of
+ // whether it was asked for, as the spec says we SHOULD.
+ return false
+ }
+ for _, dev := range existingDevices {
+ if dst == dev || strings.HasPrefix(dst, dev+"/") {
+ return false
+ }
+ }
+ return true
+}
+
+// Annotation keys and values that ValidateSpec, ShouldCreateSandbox and
+// SandboxID read from spec.Annotations; containerd sets them to support
+// multi-container pods.
+const (
+ // ContainerdContainerTypeAnnotation is the OCI annotation set by
+ // containerd to indicate whether the container to create should have
+ // its own sandbox or a container within an existing sandbox.
+ ContainerdContainerTypeAnnotation = "io.kubernetes.cri.container-type"
+ // ContainerdContainerTypeContainer is the container type value
+ // indicating the container should be created in an existing sandbox.
+ ContainerdContainerTypeContainer = "container"
+ // ContainerdContainerTypeSandbox is the container type value
+ // indicating the container should be created in a new sandbox.
+ ContainerdContainerTypeSandbox = "sandbox"
+
+ // ContainerdSandboxIDAnnotation is the OCI annotation set to indicate
+ // which sandbox the container should be created in when the container
+ // is not the first container in the sandbox.
+ ContainerdSandboxIDAnnotation = "io.kubernetes.cri.sandbox-id"
+)
+
+// ShouldCreateSandbox returns true if the spec indicates that a new sandbox
+// should be created for the container: either no container-type annotation is
+// present, or it is set to the sandbox type. If false, the container should be
+// started in an existing sandbox.
+func ShouldCreateSandbox(spec *specs.Spec) bool {
+	containerType, found := spec.Annotations[ContainerdContainerTypeAnnotation]
+	if !found {
+		return true
+	}
+	return containerType == ContainerdContainerTypeSandbox
+}
+
+// SandboxID looks up the sandbox-id annotation in the spec. It returns the ID
+// of the sandbox to join and whether the annotation was present.
+func SandboxID(spec *specs.Spec) (string, bool) {
+	sandboxID, found := spec.Annotations[ContainerdSandboxIDAnnotation]
+	return sandboxID, found
+}
+
+// WaitForReady waits for a process to become ready. The process is ready when
+// the 'ready' function returns true; while it returns false the check is
+// retried with exponential backoff (1ms initial interval, 1s maximum
+// interval). It returns an error on timeout, if the process stops, if waiting
+// on the process fails, or if 'ready' itself fails.
+func WaitForReady(pid int, timeout time.Duration, ready func() (bool, error)) error {
+	b := backoff.NewExponentialBackOff()
+	b.InitialInterval = 1 * time.Millisecond
+	b.MaxInterval = 1 * time.Second
+	b.MaxElapsedTime = timeout
+
+	check := func() error {
+		isReady, err := ready()
+		if err != nil {
+			return backoff.Permanent(err)
+		}
+		if isReady {
+			return nil
+		}
+
+		// Check if the process is still running.
+		// If the process is alive, child is 0 because of the NOHANG option.
+		// If the process has terminated, child equals the process id.
+		var (
+			ws syscall.WaitStatus
+			ru syscall.Rusage
+		)
+		child, err := syscall.Wait4(pid, &ws, syscall.WNOHANG, &ru)
+		switch {
+		case err != nil:
+			return backoff.Permanent(fmt.Errorf("error waiting for process: %v", err))
+		case child == pid:
+			return backoff.Permanent(fmt.Errorf("process %d has terminated", pid))
+		default:
+			// A plain error makes backoff.Retry try again.
+			return fmt.Errorf("process %d not running yet", pid)
+		}
+	}
+	return backoff.Retry(check, b)
+}
+
+// DebugLogFile opens a log file using 'logPattern' as location. If 'logPattern'
+// ends with '/', it's used as a directory with default file name.
+// 'logPattern' can contain variables that are substitued:
+// - %TIMESTAMP%: is replaced with a timestamp using the following format:
+// <yyyymmdd-hhmmss.uuuuuu>
+// - %COMMAND%: is replaced with 'command'
+func DebugLogFile(logPattern, command string) (*os.File, error) {
+ if strings.HasSuffix(logPattern, "/") {
+ // Default format: <debug-log>/runsc.log.<yyyymmdd-hhmmss.uuuuuu>.<command>
+ logPattern += "runsc.log.%TIMESTAMP%.%COMMAND%"
+ }
+ logPattern = strings.Replace(logPattern, "%TIMESTAMP%", time.Now().Format("20060102-150405.000000"), -1)
+ logPattern = strings.Replace(logPattern, "%COMMAND%", command, -1)
+
+ dir := filepath.Dir(logPattern)
+ if err := os.MkdirAll(dir, 0775); err != nil {
+ return nil, fmt.Errorf("error creating dir %q: %v", dir, err)
+ }
+ return os.OpenFile(logPattern, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664)
+}
+
+// Mount creates the mount point and calls Mount with the given flags.
+func Mount(src, dst, typ string, flags uint32) error {
+ // Create the mount point inside. The type must be the same as the
+ // source (file or directory).
+ var isDir bool
+ if typ == "proc" {
+ // Special case, as there is no source directory for proc mounts.
+ isDir = true
+ } else if fi, err := os.Stat(src); err != nil {
+ return fmt.Errorf("Stat(%q) failed: %v", src, err)
+ } else {
+ isDir = fi.IsDir()
+ }
+
+ if isDir {
+ // Create the destination directory.
+ if err := os.MkdirAll(dst, 0777); err != nil {
+ return fmt.Errorf("Mkdir(%q) failed: %v", dst, err)
+ }
+ } else {
+ // Create the parent destination directory.
+ parent := path.Dir(dst)
+ if err := os.MkdirAll(parent, 0777); err != nil {
+ return fmt.Errorf("Mkdir(%q) failed: %v", parent, err)
+ }
+ // Create the destination file if it does not exist.
+ f, err := os.OpenFile(dst, syscall.O_CREAT, 0777)
+ if err != nil {
+ return fmt.Errorf("Open(%q) failed: %v", dst, err)
+ }
+ f.Close()
+ }
+
+ // Do the mount.
+ if err := syscall.Mount(src, dst, typ, uintptr(flags), ""); err != nil {
+ return fmt.Errorf("Mount(%q, %q, %d) failed: %v", src, dst, flags, err)
+ }
+ return nil
+}
+
+// ContainsStr reports whether 'str' is one of the elements of 'strs'.
+func ContainsStr(strs []string, str string) bool {
+	for i := range strs {
+		if strs[i] == str {
+			return true
+		}
+	}
+	return false
+}
+
+// Cleanup lets a deferred cleanup be cancelled, for cases where the cleanup
+// must only happen conditionally. Usage:
+//
+//	c := MakeCleanup(func() { f.Close() })
+//	defer c.Clean() // any failure before release is called will close the file.
+//	...
+//	c.Release() // on success, aborts closing the file and return it.
+//	return f
+type Cleanup struct {
+	// clean is the pending cleanup function; nil once run or released.
+	clean func()
+}
+
+// MakeCleanup returns a Cleanup that runs f when Clean is called.
+func MakeCleanup(f func()) Cleanup {
+	return Cleanup{clean: f}
+}
+
+// Clean runs the cleanup function if it is still pending. The function is
+// dropped afterwards, so later calls do nothing.
+func (c *Cleanup) Clean() {
+	if c.clean == nil {
+		return
+	}
+	c.clean()
+	c.clean = nil
+}
+
+// Release drops the cleanup function without running it, i.e. the cleanup
+// function is not called after this point.
+func (c *Cleanup) Release() {
+	c.clean = nil
+}
diff --git a/runsc/version.go b/runsc/version.go
new file mode 100644
index 000000000..ce0573a9b
--- /dev/null
+++ b/runsc/version.go
@@ -0,0 +1,18 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+// version is the runsc version string. It is empty in the source and is set
+// during linking.
+// NOTE(review): presumably injected with the linker's -X flag — confirm
+// against the build rules.
+var version = ""