summaryrefslogtreecommitdiffhomepage
path: root/runsc/boot
diff options
context:
space:
mode:
Diffstat (limited to 'runsc/boot')
-rw-r--r--runsc/boot/compat.go159
-rw-r--r--runsc/boot/compat_amd64.go77
-rw-r--r--runsc/boot/config.go253
-rw-r--r--runsc/boot/controller.go491
-rw-r--r--runsc/boot/debug.go29
-rw-r--r--runsc/boot/events.go81
-rw-r--r--runsc/boot/fds.go89
-rw-r--r--runsc/boot/filter/config.go493
-rw-r--r--runsc/boot/filter/extra_filters.go28
-rw-r--r--runsc/boot/filter/extra_filters_msan.go32
-rw-r--r--runsc/boot/filter/extra_filters_race.go40
-rw-r--r--runsc/boot/filter/filter.go71
-rw-r--r--runsc/boot/fs.go774
-rw-r--r--runsc/boot/limits.go154
-rw-r--r--runsc/boot/loader.go954
-rw-r--r--runsc/boot/network.go222
-rw-r--r--runsc/boot/strace.go40
17 files changed, 3987 insertions, 0 deletions
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
new file mode 100644
index 000000000..c369e4d64
--- /dev/null
+++ b/runsc/boot/compat.go
@@ -0,0 +1,159 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+ "os"
+ "sync"
+ "syscall"
+
+ "github.com/golang/protobuf/proto"
+ "gvisor.googlesource.com/gvisor/pkg/abi"
+ "gvisor.googlesource.com/gvisor/pkg/eventchannel"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto"
+ ucspb "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/strace"
+ spb "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
+)
+
+func initCompatLogs(fd int) error {
+ ce, err := newCompatEmitter(fd)
+ if err != nil {
+ return err
+ }
+ eventchannel.AddEmitter(ce)
+ return nil
+}
+
+type compatEmitter struct {
+ sink *log.BasicLogger
+ nameMap strace.SyscallMap
+
+ // mu protects the fields below.
+ mu sync.Mutex
+
+ // trackers map syscall number to the respective tracker instance.
+ // Protected by 'mu'.
+ trackers map[uint64]syscallTracker
+}
+
+func newCompatEmitter(logFD int) (*compatEmitter, error) {
+ nameMap, ok := strace.Lookup(abi.Linux, arch.AMD64)
+ if !ok {
+ return nil, fmt.Errorf("amd64 Linux syscall table not found")
+ }
+
+ c := &compatEmitter{
+ // Always logs to default logger.
+ sink: log.Log(),
+ nameMap: nameMap,
+ trackers: make(map[uint64]syscallTracker),
+ }
+
+ if logFD > 0 {
+ f := os.NewFile(uintptr(logFD), "user log file")
+ target := log.MultiEmitter{c.sink, log.K8sJSONEmitter{log.Writer{Next: f}}}
+ c.sink = &log.BasicLogger{Level: log.Info, Emitter: target}
+ }
+ return c, nil
+}
+
+// Emit implements eventchannel.Emitter.
+func (c *compatEmitter) Emit(msg proto.Message) (bool, error) {
+ switch m := msg.(type) {
+ case *spb.UnimplementedSyscall:
+ c.emitUnimplementedSyscall(m)
+ case *ucspb.UncaughtSignal:
+ c.emitUncaughtSignal(m)
+ }
+
+ return false, nil
+}
+
+func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) {
+ regs := us.Registers.GetArch().(*rpb.Registers_Amd64).Amd64
+
+ c.mu.Lock()
+ defer c.mu.Unlock()
+
+ sysnr := regs.OrigRax
+ tr := c.trackers[sysnr]
+ if tr == nil {
+ switch sysnr {
+ case syscall.SYS_PRCTL, syscall.SYS_ARCH_PRCTL:
+ // args: cmd, ...
+ tr = newArgsTracker(0)
+
+ case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL, syscall.SYS_FUTEX, syscall.SYS_FALLOCATE:
+ // args: fd/addr, cmd, ...
+ tr = newArgsTracker(1)
+
+ case syscall.SYS_GETSOCKOPT, syscall.SYS_SETSOCKOPT:
+ // args: fd, level, name, ...
+ tr = newArgsTracker(1, 2)
+
+ case syscall.SYS_SEMCTL:
+ // args: semid, semnum, cmd, ...
+ tr = newArgsTracker(2)
+
+ default:
+ tr = &onceTracker{}
+ }
+ c.trackers[sysnr] = tr
+ }
+ if tr.shouldReport(regs) {
+ c.sink.Infof("Unsupported syscall: %s, regs: %+v", c.nameMap.Name(uintptr(sysnr)), regs)
+ tr.onReported(regs)
+ }
+}
+
+func (c *compatEmitter) emitUncaughtSignal(msg *ucspb.UncaughtSignal) {
+ sig := syscall.Signal(msg.SignalNumber)
+ c.sink.Infof(
+ "Uncaught signal: %q (%d), PID: %d, TID: %d, fault addr: %#x",
+ sig, msg.SignalNumber, msg.Pid, msg.Tid, msg.FaultAddr)
+}
+
+// Close implements eventchannel.Emitter.
+func (c *compatEmitter) Close() error {
+ c.sink = nil
+ return nil
+}
+
+// syscallTracker interface allows filters to apply differently depending on
+// the syscall and arguments.
+type syscallTracker interface {
+ // shouldReport returns true is the syscall should be reported.
+ shouldReport(regs *rpb.AMD64Registers) bool
+
+ // onReported marks the syscall as reported.
+ onReported(regs *rpb.AMD64Registers)
+}
+
+// onceTracker reports only a single time, used for most syscalls.
+type onceTracker struct {
+ reported bool
+}
+
+func (o *onceTracker) shouldReport(_ *rpb.AMD64Registers) bool {
+ return !o.reported
+}
+
+func (o *onceTracker) onReported(_ *rpb.AMD64Registers) {
+ o.reported = true
+}
diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go
new file mode 100644
index 000000000..99df5e614
--- /dev/null
+++ b/runsc/boot/compat_amd64.go
@@ -0,0 +1,77 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+
+ rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto"
+)
+
+// reportLimit is the max number of events that should be reported per tracker.
+const reportLimit = 100
+
+// argsTracker reports only once for each different combination of arguments.
+// It's used for generic syscalls like ioctl to report once per 'cmd'.
+type argsTracker struct {
+ // argsIdx is the syscall arguments to use as unique ID.
+ argsIdx []int
+ reported map[string]struct{}
+ count int
+}
+
+func newArgsTracker(argIdx ...int) *argsTracker {
+ return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})}
+}
+
+// cmd returns the command based on the syscall argument index.
+func (a *argsTracker) key(regs *rpb.AMD64Registers) string {
+ var rv string
+ for _, idx := range a.argsIdx {
+ rv += fmt.Sprintf("%d|", argVal(idx, regs))
+ }
+ return rv
+}
+
+func argVal(argIdx int, regs *rpb.AMD64Registers) uint32 {
+ switch argIdx {
+ case 0:
+ return uint32(regs.Rdi)
+ case 1:
+ return uint32(regs.Rsi)
+ case 2:
+ return uint32(regs.Rdx)
+ case 3:
+ return uint32(regs.R10)
+ case 4:
+ return uint32(regs.R8)
+ case 5:
+ return uint32(regs.R9)
+ }
+ panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+}
+
+func (a *argsTracker) shouldReport(regs *rpb.AMD64Registers) bool {
+ if a.count >= reportLimit {
+ return false
+ }
+ _, ok := a.reported[a.key(regs)]
+ return !ok
+}
+
+func (a *argsTracker) onReported(regs *rpb.AMD64Registers) {
+ a.count++
+ a.reported[a.key(regs)] = struct{}{}
+}
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
new file mode 100644
index 000000000..15f624f9b
--- /dev/null
+++ b/runsc/boot/config.go
@@ -0,0 +1,253 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+ "strconv"
+ "strings"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog"
+)
+
+// PlatformType tells which platform to use.
+type PlatformType int
+
+const (
+ // PlatformPtrace runs the sandbox with the ptrace platform.
+ PlatformPtrace PlatformType = iota
+
+ // PlatformKVM runs the sandbox with the KVM platform.
+ PlatformKVM
+)
+
+// MakePlatformType converts type from string.
+func MakePlatformType(s string) (PlatformType, error) {
+ switch s {
+ case "ptrace":
+ return PlatformPtrace, nil
+ case "kvm":
+ return PlatformKVM, nil
+ default:
+ return 0, fmt.Errorf("invalid platform type %q", s)
+ }
+}
+
+func (p PlatformType) String() string {
+ switch p {
+ case PlatformPtrace:
+ return "ptrace"
+ case PlatformKVM:
+ return "kvm"
+ default:
+ return fmt.Sprintf("unknown(%d)", p)
+ }
+}
+
+// FileAccessType tells how the filesystem is accessed.
+type FileAccessType int
+
+const (
+ // FileAccessShared sends IO requests to a Gofer process that validates the
+ // requests and forwards them to the host.
+ FileAccessShared FileAccessType = iota
+
+ // FileAccessExclusive is the same as FileAccessShared, but enables
+ // extra caching for improved performance. It should only be used if
+ // the sandbox has exclusive access to the filesystem.
+ FileAccessExclusive
+)
+
+// MakeFileAccessType converts type from string.
+func MakeFileAccessType(s string) (FileAccessType, error) {
+ switch s {
+ case "shared":
+ return FileAccessShared, nil
+ case "exclusive":
+ return FileAccessExclusive, nil
+ default:
+ return 0, fmt.Errorf("invalid file access type %q", s)
+ }
+}
+
+func (f FileAccessType) String() string {
+ switch f {
+ case FileAccessShared:
+ return "shared"
+ case FileAccessExclusive:
+ return "exclusive"
+ default:
+ return fmt.Sprintf("unknown(%d)", f)
+ }
+}
+
+// NetworkType tells which network stack to use.
+type NetworkType int
+
+const (
+ // NetworkSandbox uses internal network stack, isolated from the host.
+ NetworkSandbox NetworkType = iota
+
+ // NetworkHost redirects network related syscalls to the host network.
+ NetworkHost
+
+ // NetworkNone sets up just loopback using netstack.
+ NetworkNone
+)
+
+// MakeNetworkType converts type from string.
+func MakeNetworkType(s string) (NetworkType, error) {
+ switch s {
+ case "sandbox":
+ return NetworkSandbox, nil
+ case "host":
+ return NetworkHost, nil
+ case "none":
+ return NetworkNone, nil
+ default:
+ return 0, fmt.Errorf("invalid network type %q", s)
+ }
+}
+
+func (n NetworkType) String() string {
+ switch n {
+ case NetworkSandbox:
+ return "sandbox"
+ case NetworkHost:
+ return "host"
+ case NetworkNone:
+ return "none"
+ default:
+ return fmt.Sprintf("unknown(%d)", n)
+ }
+}
+
+// MakeWatchdogAction converts type from string.
+func MakeWatchdogAction(s string) (watchdog.Action, error) {
+ switch strings.ToLower(s) {
+ case "log", "logwarning":
+ return watchdog.LogWarning, nil
+ case "panic":
+ return watchdog.Panic, nil
+ default:
+ return 0, fmt.Errorf("invalid watchdog action %q", s)
+ }
+}
+
+// Config holds configuration that is not part of the runtime spec.
+type Config struct {
+ // RootDir is the runtime root directory.
+ RootDir string
+
+ // Debug indicates that debug logging should be enabled.
+ Debug bool
+
+ // LogFilename is the filename to log to, if not empty.
+ LogFilename string
+
+ // LogFormat is the log format.
+ LogFormat string
+
+ // DebugLog is the path to log debug information to, if not empty.
+ DebugLog string
+
+ // DebugLogFormat is the log format for debug.
+ DebugLogFormat string
+
+ // FileAccess indicates how the filesystem is accessed.
+ FileAccess FileAccessType
+
+ // Overlay is whether to wrap the root filesystem in an overlay.
+ Overlay bool
+
+ // Network indicates what type of network to use.
+ Network NetworkType
+
+ // EnableRaw indicates whether raw sockets should be enabled. Raw
+ // sockets are disabled by stripping CAP_NET_RAW from the list of
+ // capabilities.
+ EnableRaw bool
+
+ // GSO indicates that generic segmentation offload is enabled.
+ GSO bool
+
+ // LogPackets indicates that all network packets should be logged.
+ LogPackets bool
+
+ // Platform is the platform to run on.
+ Platform PlatformType
+
+ // Strace indicates that strace should be enabled.
+ Strace bool
+
+ // StraceSyscalls is the set of syscalls to trace. If StraceEnable is
+ // true and this list is empty, then all syscalls will be traced.
+ StraceSyscalls []string
+
+ // StraceLogSize is the max size of data blobs to display.
+ StraceLogSize uint
+
+ // DisableSeccomp indicates whether seccomp syscall filters should be
+ // disabled. Pardon the double negation, but default to enabled is important.
+ DisableSeccomp bool
+
+ // WatchdogAction sets what action the watchdog takes when triggered.
+ WatchdogAction watchdog.Action
+
+ // PanicSignal registers signal handling that panics. Usually set to
+ // SIGUSR2(12) to troubleshoot hangs. -1 disables it.
+ PanicSignal int
+
+ // ProfileEnable is set to prepare the sandbox to be profiled.
+ ProfileEnable bool
+
+ // RestoreFile is the path to the saved container image
+ RestoreFile string
+
+ // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
+ // tests. It allows runsc to start the sandbox process as the current
+ // user, and without chrooting the sandbox process. This can be
+ // necessary in test environments that have limited capabilities.
+ TestOnlyAllowRunAsCurrentUserWithoutChroot bool
+}
+
+// ToFlags returns a slice of flags that correspond to the given Config.
+func (c *Config) ToFlags() []string {
+ f := []string{
+ "--root=" + c.RootDir,
+ "--debug=" + strconv.FormatBool(c.Debug),
+ "--log=" + c.LogFilename,
+ "--log-format=" + c.LogFormat,
+ "--debug-log=" + c.DebugLog,
+ "--debug-log-format=" + c.DebugLogFormat,
+ "--file-access=" + c.FileAccess.String(),
+ "--overlay=" + strconv.FormatBool(c.Overlay),
+ "--network=" + c.Network.String(),
+ "--log-packets=" + strconv.FormatBool(c.LogPackets),
+ "--platform=" + c.Platform.String(),
+ "--strace=" + strconv.FormatBool(c.Strace),
+ "--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","),
+ "--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)),
+ "--watchdog-action=" + c.WatchdogAction.String(),
+ "--panic-signal=" + strconv.Itoa(c.PanicSignal),
+ "--profile=" + strconv.FormatBool(c.ProfileEnable),
+ "--net-raw=" + strconv.FormatBool(c.EnableRaw),
+ }
+ if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+ // Only include if set since it is never to be used by users.
+ f = append(f, "-TESTONLY-unsafe-nonroot=true")
+ }
+ return f
+}
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
new file mode 100644
index 000000000..72ab9ef86
--- /dev/null
+++ b/runsc/boot/controller.go
@@ -0,0 +1,491 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "errors"
+ "fmt"
+ "os"
+ "path"
+ "syscall"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/control/server"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/control"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/state"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
+ "gvisor.googlesource.com/gvisor/pkg/urpc"
+)
+
+const (
+ // ContainerCheckpoint checkpoints a container.
+ ContainerCheckpoint = "containerManager.Checkpoint"
+
+ // ContainerCreate creates a container.
+ ContainerCreate = "containerManager.Create"
+
+ // ContainerDestroy is used to stop a non-root container and free all
+ // associated resources in the sandbox.
+ ContainerDestroy = "containerManager.Destroy"
+
+ // ContainerEvent is the URPC endpoint for getting stats about the
+ // container used by "runsc events".
+ ContainerEvent = "containerManager.Event"
+
+ // ContainerExecuteAsync is the URPC endpoint for executing a command in a
+ // container..
+ ContainerExecuteAsync = "containerManager.ExecuteAsync"
+
+ // ContainerPause pauses the container.
+ ContainerPause = "containerManager.Pause"
+
+ // ContainerProcesses is the URPC endpoint for getting the list of
+ // processes running in a container.
+ ContainerProcesses = "containerManager.Processes"
+
+ // ContainerRestore restores a container from a statefile.
+ ContainerRestore = "containerManager.Restore"
+
+ // ContainerResume unpauses the paused container.
+ ContainerResume = "containerManager.Resume"
+
+ // ContainerSignal is used to send a signal to a container.
+ ContainerSignal = "containerManager.Signal"
+
+ // ContainerSignalProcess is used to send a signal to a particular
+ // process in a container.
+ ContainerSignalProcess = "containerManager.SignalProcess"
+
+ // ContainerStart is the URPC endpoint for running a non-root container
+ // within a sandbox.
+ ContainerStart = "containerManager.Start"
+
+ // ContainerWait is used to wait on the init process of the container
+ // and return its ExitStatus.
+ ContainerWait = "containerManager.Wait"
+
+ // ContainerWaitPID is used to wait on a process with a certain PID in
+ // the sandbox and return its ExitStatus.
+ ContainerWaitPID = "containerManager.WaitPID"
+
+ // NetworkCreateLinksAndRoutes is the URPC endpoint for creating links
+ // and routes in a network stack.
+ NetworkCreateLinksAndRoutes = "Network.CreateLinksAndRoutes"
+
+ // RootContainerStart is the URPC endpoint for starting a new sandbox
+ // with root container.
+ RootContainerStart = "containerManager.StartRoot"
+
+ // SandboxStacks collects sandbox stacks for debugging.
+ SandboxStacks = "debug.Stacks"
+
+ // Profiling related commands (see pprof.go for more details).
+ StartCPUProfile = "Profile.StartCPUProfile"
+ StopCPUProfile = "Profile.StopCPUProfile"
+ HeapProfile = "Profile.HeapProfile"
+ StartTrace = "Profile.StartTrace"
+ StopTrace = "Profile.StopTrace"
+)
+
+// ControlSocketAddr generates an abstract unix socket name for the given ID.
+func ControlSocketAddr(id string) string {
+ return fmt.Sprintf("\x00runsc-sandbox.%s", id)
+}
+
+// controller holds the control server, and is used for communication into the
+// sandbox.
+type controller struct {
+ // srv is the control server.
+ srv *server.Server
+
+ // manager holds the containerManager methods.
+ manager *containerManager
+}
+
+// newController creates a new controller. The caller must call
+// controller.srv.StartServing() to start the controller.
+func newController(fd int, l *Loader) (*controller, error) {
+ srv, err := server.CreateFromFD(fd)
+ if err != nil {
+ return nil, err
+ }
+
+ manager := &containerManager{
+ startChan: make(chan struct{}),
+ startResultChan: make(chan error),
+ l: l,
+ }
+ srv.Register(manager)
+
+ if eps, ok := l.k.NetworkStack().(*epsocket.Stack); ok {
+ net := &Network{
+ Stack: eps.Stack,
+ }
+ srv.Register(net)
+ }
+
+ srv.Register(&debug{})
+ if l.conf.ProfileEnable {
+ srv.Register(&control.Profile{})
+ }
+
+ return &controller{
+ srv: srv,
+ manager: manager,
+ }, nil
+}
+
+// containerManager manages sandboes containers.
+type containerManager struct {
+ // startChan is used to signal when the root container process should
+ // be started.
+ startChan chan struct{}
+
+ // startResultChan is used to signal when the root container has
+ // started. Any errors encountered during startup will be sent to the
+ // channel. A nil value indicates success.
+ startResultChan chan error
+
+ // l is the loader that creates containers and sandboxes.
+ l *Loader
+}
+
+// StartRoot will start the root container process.
+func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error {
+ log.Debugf("containerManager.StartRoot %q", *cid)
+ // Tell the root container to start and wait for the result.
+ cm.startChan <- struct{}{}
+ if err := <-cm.startResultChan; err != nil {
+ return fmt.Errorf("starting sandbox: %v", err)
+ }
+ return nil
+}
+
+// Processes retrieves information about processes running in the sandbox.
+func (cm *containerManager) Processes(cid *string, out *[]*control.Process) error {
+ log.Debugf("containerManager.Processes: %q", *cid)
+ return control.Processes(cm.l.k, *cid, out)
+}
+
+// Create creates a container within a sandbox.
+func (cm *containerManager) Create(cid *string, _ *struct{}) error {
+ log.Debugf("containerManager.Create: %q", *cid)
+ return cm.l.createContainer(*cid)
+}
+
+// StartArgs contains arguments to the Start method.
+type StartArgs struct {
+ // Spec is the spec of the container to start.
+ Spec *specs.Spec
+
+ // Config is the runsc-specific configuration for the sandbox.
+ Conf *Config
+
+ // CID is the ID of the container to start.
+ CID string
+
+ // FilePayload contains, in order:
+ // * stdin, stdout, and stderr.
+ // * the file descriptor over which the sandbox will
+ // request files from its root filesystem.
+ urpc.FilePayload
+}
+
+// Start runs a created container within a sandbox.
+func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
+ log.Debugf("containerManager.Start: %+v", args)
+
+ // Validate arguments.
+ if args == nil {
+ return errors.New("start missing arguments")
+ }
+ if args.Spec == nil {
+ return errors.New("start arguments missing spec")
+ }
+ if args.Conf == nil {
+ return errors.New("start arguments missing config")
+ }
+ if args.CID == "" {
+ return errors.New("start argument missing container ID")
+ }
+ // Prevent CIDs containing ".." from confusing the sentry when creating
+ // /containers/<cid> directory.
+ // TODO(b/129293409): Once we have multiple independent roots, this
+ // check won't be necessary.
+ if path.Clean(args.CID) != args.CID {
+ return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID)
+ }
+ if len(args.FilePayload.Files) < 4 {
+ return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer")
+ }
+
+ err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files)
+ if err != nil {
+ log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err)
+ return err
+ }
+ log.Debugf("Container %q started", args.CID)
+
+ return nil
+}
+
+// Destroy stops a container if it is still running and cleans up its
+// filesystem.
+func (cm *containerManager) Destroy(cid *string, _ *struct{}) error {
+ log.Debugf("containerManager.destroy %q", *cid)
+ return cm.l.destroyContainer(*cid)
+}
+
+// ExecuteAsync starts running a command on a created or running sandbox. It
+// returns the PID of the new process.
+func (cm *containerManager) ExecuteAsync(args *control.ExecArgs, pid *int32) error {
+ log.Debugf("containerManager.ExecuteAsync: %+v", args)
+ tgid, err := cm.l.executeAsync(args)
+ if err != nil {
+ log.Debugf("containerManager.ExecuteAsync failed: %+v: %v", args, err)
+ return err
+ }
+ *pid = int32(tgid)
+ return nil
+}
+
+// Checkpoint pauses a sandbox and saves its state.
+func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error {
+ log.Debugf("containerManager.Checkpoint")
+ state := control.State{
+ Kernel: cm.l.k,
+ Watchdog: cm.l.watchdog,
+ }
+ return state.Save(o, nil)
+}
+
+// Pause suspends a container.
+func (cm *containerManager) Pause(_, _ *struct{}) error {
+ log.Debugf("containerManager.Pause")
+ cm.l.k.Pause()
+ return nil
+}
+
+// RestoreOpts contains options related to restoring a container's file system.
+type RestoreOpts struct {
+ // FilePayload contains the state file to be restored, followed by the
+ // platform device file if necessary.
+ urpc.FilePayload
+
+ // SandboxID contains the ID of the sandbox.
+ SandboxID string
+}
+
+// Restore loads a container from a statefile.
+// The container's current kernel is destroyed, a restore environment is
+// created, and the kernel is recreated with the restore state file. The
+// container then sends the signal to start.
+func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
+ log.Debugf("containerManager.Restore")
+
+ var specFile, deviceFile *os.File
+ switch numFiles := len(o.FilePayload.Files); numFiles {
+ case 2:
+ // The device file is donated to the platform.
+ // Can't take ownership away from os.File. dup them to get a new FD.
+ fd, err := syscall.Dup(int(o.FilePayload.Files[1].Fd()))
+ if err != nil {
+ return fmt.Errorf("failed to dup file: %v", err)
+ }
+ deviceFile = os.NewFile(uintptr(fd), "platform device")
+ fallthrough
+ case 1:
+ specFile = o.FilePayload.Files[0]
+ case 0:
+ return fmt.Errorf("at least one file must be passed to Restore")
+ default:
+ return fmt.Errorf("at most two files may be passed to Restore")
+ }
+
+ networkStack := cm.l.k.NetworkStack()
+ // Destroy the old kernel and create a new kernel.
+ cm.l.k.Pause()
+ cm.l.k.Destroy()
+
+ p, err := createPlatform(cm.l.conf, deviceFile)
+ if err != nil {
+ return fmt.Errorf("creating platform: %v", err)
+ }
+ k := &kernel.Kernel{
+ Platform: p,
+ }
+ mf, err := createMemoryFile()
+ if err != nil {
+ return fmt.Errorf("creating memory file: %v", err)
+ }
+ k.SetMemoryFile(mf)
+ cm.l.k = k
+
+ // Set up the restore environment.
+ fds := &fdDispenser{fds: cm.l.goferFDs}
+ renv, err := createRestoreEnvironment(cm.l.spec, cm.l.conf, fds)
+ if err != nil {
+ return fmt.Errorf("creating RestoreEnvironment: %v", err)
+ }
+ fs.SetRestoreEnvironment(*renv)
+
+ // Prepare to load from the state file.
+ if eps, ok := networkStack.(*epsocket.Stack); ok {
+ stack.StackFromEnv = eps.Stack // FIXME(b/36201077)
+ }
+ info, err := specFile.Stat()
+ if err != nil {
+ return err
+ }
+ if info.Size() == 0 {
+ return fmt.Errorf("file cannot be empty")
+ }
+
+ // Load the state.
+ loadOpts := state.LoadOpts{Source: specFile}
+ if err := loadOpts.Load(k, networkStack); err != nil {
+ return err
+ }
+
+ // Set timekeeper.
+ k.Timekeeper().SetClocks(time.NewCalibratedClocks())
+
+ // Since we have a new kernel we also must make a new watchdog.
+ watchdog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction)
+
+ // Change the loader fields to reflect the changes made when restoring.
+ cm.l.k = k
+ cm.l.watchdog = watchdog
+ cm.l.rootProcArgs = kernel.CreateProcessArgs{}
+ cm.l.restore = true
+
+ // Reinitialize the sandbox ID and processes map. Note that it doesn't
+ // restore the state of multiple containers, nor exec processes.
+ cm.l.sandboxID = o.SandboxID
+ cm.l.mu.Lock()
+ eid := execID{cid: o.SandboxID}
+ cm.l.processes = map[execID]*execProcess{
+ eid: {
+ tg: cm.l.k.GlobalInit(),
+ },
+ }
+ cm.l.mu.Unlock()
+
+ // Tell the root container to start and wait for the result.
+ cm.startChan <- struct{}{}
+ if err := <-cm.startResultChan; err != nil {
+ return fmt.Errorf("starting sandbox: %v", err)
+ }
+
+ return nil
+}
+
+// Resume unpauses a container.
+func (cm *containerManager) Resume(_, _ *struct{}) error {
+ log.Debugf("containerManager.Resume")
+ cm.l.k.Unpause()
+ return nil
+}
+
+// Wait waits for the init process in the given container.
+func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error {
+ log.Debugf("containerManager.Wait")
+ err := cm.l.waitContainer(*cid, waitStatus)
+ log.Debugf("containerManager.Wait returned, waitStatus: %v: %v", waitStatus, err)
+ return err
+}
+
+// WaitPIDArgs are arguments to the WaitPID method.
+type WaitPIDArgs struct {
+ // PID is the PID in the container's PID namespace.
+ PID int32
+
+ // CID is the container ID.
+ CID string
+
+ // ClearStatus determines whether the exit status of the process should
+ // be cleared when WaitPID returns.
+ ClearStatus bool
+}
+
+// WaitPID waits for the process with PID 'pid' in the sandbox.
+func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error {
+ log.Debugf("containerManager.Wait")
+ return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, args.ClearStatus, waitStatus)
+}
+
+// SignalDeliveryMode enumerates different signal delivery modes.
+type SignalDeliveryMode int
+
+const (
+ // DeliverToProcess delivers the signal to the container process with
+ // the specified PID. If PID is 0, then the container init process is
+ // signaled.
+ DeliverToProcess SignalDeliveryMode = iota
+
+ // DeliverToAllProcesses delivers the signal to all processes in the
+ // container. PID must be 0.
+ DeliverToAllProcesses
+
+ // DeliverToForegroundProcessGroup delivers the signal to the
+ // foreground process group in the same TTY session as the specified
+ // process. If PID is 0, then the signal is delivered to the foreground
+ // process group for the TTY for the init process.
+ DeliverToForegroundProcessGroup
+)
+
+func (s SignalDeliveryMode) String() string {
+ switch s {
+ case DeliverToProcess:
+ return "Process"
+ case DeliverToAllProcesses:
+ return "All"
+ case DeliverToForegroundProcessGroup:
+ return "Foreground Process Group"
+ }
+ return fmt.Sprintf("unknown signal delivery mode: %d", s)
+}
+
+// SignalArgs are arguments to the Signal method.
+type SignalArgs struct {
+ // CID is the container ID.
+ CID string
+
+ // Signo is the signal to send to the process.
+ Signo int32
+
+ // PID is the process ID in the given container that will be signaled.
+ // If 0, the root container will be signalled.
+ PID int32
+
+ // Mode is the signal delivery mode.
+ Mode SignalDeliveryMode
+}
+
+// Signal sends a signal to one or more processes in a container. If args.PID
+// is 0, then the container init process is used. Depending on the
+// args.SignalDeliveryMode option, the signal may be sent directly to the
+// indicated process, to all processes in the container, or to the foreground
+// process group.
+func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error {
+ log.Debugf("containerManager.Signal %+v", args)
+ return cm.l.signal(args.CID, args.PID, args.Signo, args.Mode)
+}
diff --git a/runsc/boot/debug.go b/runsc/boot/debug.go
new file mode 100644
index 000000000..79f7387ac
--- /dev/null
+++ b/runsc/boot/debug.go
@@ -0,0 +1,29 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/log"
+)
+
+type debug struct {
+}
+
+// Stacks collects all sandbox stacks and copies them to 'stacks'.
+func (*debug) Stacks(_ *struct{}, stacks *string) error {
+ buf := log.Stacks(true)
+ *stacks = string(buf)
+ return nil
+}
diff --git a/runsc/boot/events.go b/runsc/boot/events.go
new file mode 100644
index 000000000..ffd99f5e9
--- /dev/null
+++ b/runsc/boot/events.go
@@ -0,0 +1,81 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+)
+
+// Event struct for encoding the event data to JSON. Corresponds to runc's
+// main.event struct.
+type Event struct {
+ Type string `json:"type"`
+ ID string `json:"id"`
+ Data interface{} `json:"data,omitempty"`
+}
+
+// Stats is the runc specific stats structure for stability when encoding and
+// decoding stats.
+type Stats struct {
+ Memory Memory `json:"memory"`
+ Pids Pids `json:"pids"`
+}
+
+// Pids contains stats on processes.
+type Pids struct {
+ Current uint64 `json:"current,omitempty"`
+ Limit uint64 `json:"limit,omitempty"`
+}
+
+// MemoryEntry contains stats on a kind of memory.
+type MemoryEntry struct {
+ Limit uint64 `json:"limit"`
+ Usage uint64 `json:"usage,omitempty"`
+ Max uint64 `json:"max,omitempty"`
+ Failcnt uint64 `json:"failcnt"`
+}
+
+// Memory contains stats on memory.
+type Memory struct {
+ Cache uint64 `json:"cache,omitempty"`
+ Usage MemoryEntry `json:"usage,omitempty"`
+ Swap MemoryEntry `json:"swap,omitempty"`
+ Kernel MemoryEntry `json:"kernel,omitempty"`
+ KernelTCP MemoryEntry `json:"kernelTCP,omitempty"`
+ Raw map[string]uint64 `json:"raw,omitempty"`
+}
+
+// Event gets the events from the container.
+func (cm *containerManager) Event(_ *struct{}, out *Event) error {
+ stats := &Stats{}
+ stats.populateMemory(cm.l.k)
+ stats.populatePIDs(cm.l.k)
+ *out = Event{Type: "stats", Data: stats}
+ return nil
+}
+
+func (s *Stats) populateMemory(k *kernel.Kernel) {
+ mem := k.MemoryFile()
+ mem.UpdateUsage()
+ _, totalUsage := usage.MemoryAccounting.Copy()
+ s.Memory.Usage = MemoryEntry{
+ Usage: totalUsage,
+ }
+}
+
+func (s *Stats) populatePIDs(k *kernel.Kernel) {
+ s.Pids.Current = uint64(len(k.TaskSet().Root.ThreadGroups()))
+}
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go
new file mode 100644
index 000000000..4e428b49c
--- /dev/null
+++ b/runsc/boot/fds.go
@@ -0,0 +1,89 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+)
+
+// createFDMap creates an FD map that contains stdin, stdout, and stderr. If
+// console is true, then ioctl calls will be passed through to the host FD.
+// Upon success, createFDMap dups then closes stdioFDs.
+func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) {
+ if len(stdioFDs) != 3 {
+ return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
+ }
+
+ fdm := k.NewFDMap()
+ defer fdm.DecRef()
+ mounter := fs.FileOwnerFromContext(ctx)
+
+ // Maps sandbox FD to host FD.
+ fdMap := map[int]int{
+ 0: stdioFDs[0],
+ 1: stdioFDs[1],
+ 2: stdioFDs[2],
+ }
+
+ var ttyFile *fs.File
+ for appFD, hostFD := range fdMap {
+ var appFile *fs.File
+
+ if console && appFD < 3 {
+ // Import the file as a host TTY file.
+ if ttyFile == nil {
+ var err error
+ appFile, err = host.ImportFile(ctx, hostFD, mounter, true /* isTTY */)
+ if err != nil {
+ return nil, err
+ }
+ defer appFile.DecRef()
+
+ // Remember this in the TTY file, as we will
+ // use it for the other stdio FDs.
+ ttyFile = appFile
+ } else {
+ // Re-use the existing TTY file, as all three
+ // stdio FDs must point to the same fs.File in
+ // order to share TTY state, specifically the
+ // foreground process group id.
+ appFile = ttyFile
+ }
+ } else {
+ // Import the file as a regular host file.
+ var err error
+ appFile, err = host.ImportFile(ctx, hostFD, mounter, false /* isTTY */)
+ if err != nil {
+ return nil, err
+ }
+ defer appFile.DecRef()
+ }
+
+ // Add the file to the FD map.
+ if err := fdm.NewFDAt(kdefs.FD(appFD), appFile, kernel.FDFlags{}, l); err != nil {
+ return nil, err
+ }
+ }
+
+ fdm.IncRef()
+ return fdm, nil
+}
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
new file mode 100644
index 000000000..652da1cef
--- /dev/null
+++ b/runsc/boot/filter/config.go
@@ -0,0 +1,493 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package filter
+
+import (
+ "os"
+ "syscall"
+
+ "golang.org/x/sys/unix"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased"
+)
+
+// allowedSyscalls is the set of syscalls executed by the Sentry to the host OS.
+var allowedSyscalls = seccomp.SyscallRules{
+ syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
+ {seccomp.AllowValue(linux.ARCH_GET_FS)},
+ {seccomp.AllowValue(linux.ARCH_SET_FS)},
+ },
+ syscall.SYS_CLOCK_GETTIME: {},
+ syscall.SYS_CLONE: []seccomp.Rule{
+ {
+ seccomp.AllowValue(
+ syscall.CLONE_VM |
+ syscall.CLONE_FS |
+ syscall.CLONE_FILES |
+ syscall.CLONE_SIGHAND |
+ syscall.CLONE_SYSVSEM |
+ syscall.CLONE_THREAD),
+ },
+ },
+ syscall.SYS_CLOSE: {},
+ syscall.SYS_DUP: {},
+ syscall.SYS_EPOLL_CREATE1: {},
+ syscall.SYS_EPOLL_CTL: {},
+ syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(0),
+ },
+ },
+ syscall.SYS_EVENTFD2: []seccomp.Rule{
+ {
+ seccomp.AllowValue(0),
+ seccomp.AllowValue(0),
+ },
+ },
+ syscall.SYS_EXIT: {},
+ syscall.SYS_EXIT_GROUP: {},
+ syscall.SYS_FALLOCATE: {},
+ syscall.SYS_FCHMOD: {},
+ syscall.SYS_FCNTL: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.F_GETFL),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.F_SETFL),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.F_GETFD),
+ },
+ },
+ syscall.SYS_FSTAT: {},
+ syscall.SYS_FSYNC: {},
+ syscall.SYS_FTRUNCATE: {},
+ syscall.SYS_FUTEX: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(0),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(0),
+ },
+ },
+ syscall.SYS_GETPID: {},
+ unix.SYS_GETRANDOM: {},
+ syscall.SYS_GETSOCKOPT: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_DOMAIN),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_TYPE),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_ERROR),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_SNDBUF),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_REUSEADDR),
+ },
+ },
+ syscall.SYS_GETTID: {},
+ syscall.SYS_GETTIMEOFDAY: {},
+ // SYS_IOCTL is needed for terminal support, but we only allow
+ // setting/getting termios and winsize.
+ syscall.SYS_IOCTL: []seccomp.Rule{
+ {
+ seccomp.AllowAny{}, /* fd */
+ seccomp.AllowValue(linux.TCGETS),
+ seccomp.AllowAny{}, /* termios struct */
+ },
+ {
+ seccomp.AllowAny{}, /* fd */
+ seccomp.AllowValue(linux.TCSETS),
+ seccomp.AllowAny{}, /* termios struct */
+ },
+ {
+ seccomp.AllowAny{}, /* fd */
+ seccomp.AllowValue(linux.TCSETSF),
+ seccomp.AllowAny{}, /* termios struct */
+ },
+ {
+ seccomp.AllowAny{}, /* fd */
+ seccomp.AllowValue(linux.TCSETSW),
+ seccomp.AllowAny{}, /* termios struct */
+ },
+ {
+ seccomp.AllowAny{}, /* fd */
+ seccomp.AllowValue(linux.TIOCSWINSZ),
+ seccomp.AllowAny{}, /* winsize struct */
+ },
+ {
+ seccomp.AllowAny{}, /* fd */
+ seccomp.AllowValue(linux.TIOCGWINSZ),
+ seccomp.AllowAny{}, /* winsize struct */
+ },
+ },
+ syscall.SYS_LSEEK: {},
+ syscall.SYS_MADVISE: {},
+ syscall.SYS_MINCORE: {},
+ syscall.SYS_MMAP: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MAP_SHARED),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MAP_PRIVATE),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.PROT_WRITE | syscall.PROT_READ),
+ seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
+ },
+ },
+ syscall.SYS_MPROTECT: {},
+ syscall.SYS_MUNMAP: {},
+ syscall.SYS_NANOSLEEP: {},
+ syscall.SYS_POLL: {},
+ syscall.SYS_PREAD64: {},
+ syscall.SYS_PWRITE64: {},
+ syscall.SYS_READ: {},
+ syscall.SYS_RECVMSG: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
+ },
+ },
+ syscall.SYS_RECVMMSG: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(fdbased.MaxMsgsPerRecv),
+ seccomp.AllowValue(syscall.MSG_DONTWAIT),
+ seccomp.AllowValue(0),
+ },
+ },
+ syscall.SYS_RESTART_SYSCALL: {},
+ syscall.SYS_RT_SIGACTION: {},
+ syscall.SYS_RT_SIGPROCMASK: {},
+ syscall.SYS_RT_SIGRETURN: {},
+ syscall.SYS_SCHED_YIELD: {},
+ syscall.SYS_SENDMSG: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
+ },
+ },
+ syscall.SYS_SETITIMER: {},
+ syscall.SYS_SHUTDOWN: []seccomp.Rule{
+ {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
+ },
+ syscall.SYS_SIGALTSTACK: {},
+ syscall.SYS_SYNC_FILE_RANGE: {},
+ syscall.SYS_TGKILL: []seccomp.Rule{
+ {
+ seccomp.AllowValue(uint64(os.Getpid())),
+ },
+ },
+ syscall.SYS_WRITE: {},
+ // The only user in rawfile.NonBlockingWrite3 always passes iovcnt with
+ // values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR
+ // option is enabled for a packet socket.
+ syscall.SYS_WRITEV: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(2),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(3),
+ },
+ },
+}
+
+// hostInetFilters contains syscalls that are needed by sentry/socket/hostinet.
+func hostInetFilters() seccomp.SyscallRules {
+ return seccomp.SyscallRules{
+ syscall.SYS_ACCEPT4: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ },
+ },
+ syscall.SYS_BIND: {},
+ syscall.SYS_CONNECT: {},
+ syscall.SYS_GETPEERNAME: {},
+ syscall.SYS_GETSOCKNAME: {},
+ syscall.SYS_GETSOCKOPT: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_IPV6),
+ seccomp.AllowValue(syscall.IPV6_V6ONLY),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_ERROR),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_KEEPALIVE),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_SNDBUF),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_RCVBUF),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_REUSEADDR),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_TYPE),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_LINGER),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_TCP),
+ seccomp.AllowValue(syscall.TCP_NODELAY),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_TCP),
+ seccomp.AllowValue(syscall.TCP_INFO),
+ },
+ },
+ syscall.SYS_IOCTL: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.TIOCOUTQ),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.TIOCINQ),
+ },
+ },
+ syscall.SYS_LISTEN: {},
+ syscall.SYS_READV: {},
+ syscall.SYS_RECVFROM: {},
+ syscall.SYS_RECVMSG: {},
+ syscall.SYS_SENDMSG: {},
+ syscall.SYS_SENDTO: {},
+ syscall.SYS_SETSOCKOPT: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_IPV6),
+ seccomp.AllowValue(syscall.IPV6_V6ONLY),
+ seccomp.AllowAny{},
+ seccomp.AllowValue(4),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_SNDBUF),
+ seccomp.AllowAny{},
+ seccomp.AllowValue(4),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_RCVBUF),
+ seccomp.AllowAny{},
+ seccomp.AllowValue(4),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_REUSEADDR),
+ seccomp.AllowAny{},
+ seccomp.AllowValue(4),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_TCP),
+ seccomp.AllowValue(syscall.TCP_NODELAY),
+ seccomp.AllowAny{},
+ seccomp.AllowValue(4),
+ },
+ },
+ syscall.SYS_SHUTDOWN: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SHUT_RD),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SHUT_WR),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SHUT_RDWR),
+ },
+ },
+ syscall.SYS_SOCKET: []seccomp.Rule{
+ {
+ seccomp.AllowValue(syscall.AF_INET),
+ seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.AllowValue(0),
+ },
+ {
+ seccomp.AllowValue(syscall.AF_INET),
+ seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.AllowValue(0),
+ },
+ {
+ seccomp.AllowValue(syscall.AF_INET6),
+ seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.AllowValue(0),
+ },
+ {
+ seccomp.AllowValue(syscall.AF_INET6),
+ seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.AllowValue(0),
+ },
+ },
+ syscall.SYS_WRITEV: {},
+ }
+}
+
+// ptraceFilters returns syscalls made exclusively by the ptrace platform.
+func ptraceFilters() seccomp.SyscallRules {
+ return seccomp.SyscallRules{
+ unix.SYS_GETCPU: {},
+ unix.SYS_SCHED_SETAFFINITY: {},
+ syscall.SYS_PTRACE: {},
+ syscall.SYS_TGKILL: {},
+ syscall.SYS_WAIT4: {},
+ }
+}
+
+// kvmFilters returns syscalls made exclusively by the KVM platform.
+func kvmFilters() seccomp.SyscallRules {
+ return seccomp.SyscallRules{
+ syscall.SYS_ARCH_PRCTL: {},
+ syscall.SYS_IOCTL: {},
+ syscall.SYS_MMAP: {},
+ syscall.SYS_RT_SIGSUSPEND: {},
+ syscall.SYS_RT_SIGTIMEDWAIT: {},
+ 0xffffffffffffffff: {}, // KVM uses syscall -1 to transition to host.
+ }
+}
+
+func controlServerFilters(fd int) seccomp.SyscallRules {
+ return seccomp.SyscallRules{
+ syscall.SYS_ACCEPT: []seccomp.Rule{
+ {
+ seccomp.AllowValue(fd),
+ },
+ },
+ syscall.SYS_LISTEN: []seccomp.Rule{
+ {
+ seccomp.AllowValue(fd),
+ seccomp.AllowValue(16 /* unet.backlog */),
+ },
+ },
+ syscall.SYS_GETSOCKOPT: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_SOCKET),
+ seccomp.AllowValue(syscall.SO_PEERCRED),
+ },
+ },
+ }
+}
+
+// profileFilters returns extra syscalls made by runtime/pprof package.
+func profileFilters() seccomp.SyscallRules {
+ return seccomp.SyscallRules{
+ syscall.SYS_OPENAT: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
+ },
+ },
+ }
+}
diff --git a/runsc/boot/filter/extra_filters.go b/runsc/boot/filter/extra_filters.go
new file mode 100644
index 000000000..5c5ec4e06
--- /dev/null
+++ b/runsc/boot/filter/extra_filters.go
@@ -0,0 +1,28 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !msan,!race
+
+package filter
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+)
+
+// instrumentationFilters returns additional filters for syscalls used by
+// Go intrumentation tools, e.g. -race, -msan.
+// Returns empty when disabled.
+func instrumentationFilters() seccomp.SyscallRules {
+ return nil
+}
diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go
new file mode 100644
index 000000000..ac5a0f1aa
--- /dev/null
+++ b/runsc/boot/filter/extra_filters_msan.go
@@ -0,0 +1,32 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build msan
+
+package filter
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+)
+
+// instrumentationFilters returns additional filters for syscalls used by MSAN.
+func instrumentationFilters() seccomp.SyscallRules {
+ Report("MSAN is enabled: syscall filters less restrictive!")
+ return seccomp.SyscallRules{
+ syscall.SYS_SCHED_GETAFFINITY: {},
+ syscall.SYS_SET_ROBUST_LIST: {},
+ }
+}
diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go
new file mode 100644
index 000000000..ba3c1ce87
--- /dev/null
+++ b/runsc/boot/filter/extra_filters_race.go
@@ -0,0 +1,40 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build race
+
+package filter
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+)
+
+// instrumentationFilters returns additional filters for syscalls used by TSAN.
+func instrumentationFilters() seccomp.SyscallRules {
+ Report("TSAN is enabled: syscall filters less restrictive!")
+ return seccomp.SyscallRules{
+ syscall.SYS_BRK: {},
+ syscall.SYS_CLONE: {},
+ syscall.SYS_FUTEX: {},
+ syscall.SYS_MMAP: {},
+ syscall.SYS_MUNLOCK: {},
+ syscall.SYS_NANOSLEEP: {},
+ syscall.SYS_OPEN: {},
+ syscall.SYS_SET_ROBUST_LIST: {},
+ // Used within glibc's malloc.
+ syscall.SYS_TIME: {},
+ }
+}
diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go
new file mode 100644
index 000000000..17479e0dd
--- /dev/null
+++ b/runsc/boot/filter/filter.go
@@ -0,0 +1,71 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package filter defines all syscalls the sandbox is allowed to make
+// to the host, and installs seccomp filters to prevent prohibited
+// syscalls in case it's compromised.
+package filter
+
+import (
+ "fmt"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace"
+)
+
+// Options are seccomp filter related options.
+type Options struct {
+ Platform platform.Platform
+ HostNetwork bool
+ ProfileEnable bool
+ ControllerFD int
+}
+
+// Install installs seccomp filters for based on the given platform.
+func Install(opt Options) error {
+ s := allowedSyscalls
+ s.Merge(controlServerFilters(opt.ControllerFD))
+
+ // Set of additional filters used by -race and -msan. Returns empty
+ // when not enabled.
+ s.Merge(instrumentationFilters())
+
+ if opt.HostNetwork {
+ Report("host networking enabled: syscall filters less restrictive!")
+ s.Merge(hostInetFilters())
+ }
+ if opt.ProfileEnable {
+ Report("profile enabled: syscall filters less restrictive!")
+ s.Merge(profileFilters())
+ }
+
+ switch p := opt.Platform.(type) {
+ case *ptrace.PTrace:
+ s.Merge(ptraceFilters())
+ case *kvm.KVM:
+ s.Merge(kvmFilters())
+ default:
+ return fmt.Errorf("unknown platform type %T", p)
+ }
+
+ return seccomp.Install(s)
+}
+
+// Report writes a warning message to the log.
+func Report(msg string) {
+ log.Warningf("*** SECCOMP WARNING: %s", msg)
+}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
new file mode 100644
index 000000000..4b1557b9a
--- /dev/null
+++ b/runsc/boot/fs.go
@@ -0,0 +1,774 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+ "path"
+ "path/filepath"
+ "strconv"
+ "strings"
+ "syscall"
+
+ // Include filesystem types that OCI spec might mount.
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/dev"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+const (
+ // Filesystem name for 9p gofer mounts.
+ rootFsName = "9p"
+
+ // Device name for root mount.
+ rootDevice = "9pfs-/"
+
+ // ChildContainersDir is the directory where child container root
+ // filesystems are mounted.
+ ChildContainersDir = "/__runsc_containers__"
+
+ // Filesystems that runsc supports.
+ bind = "bind"
+ devpts = "devpts"
+ devtmpfs = "devtmpfs"
+ proc = "proc"
+ sysfs = "sysfs"
+ tmpfs = "tmpfs"
+ nonefs = "none"
+)
+
+type fdDispenser struct {
+ fds []int
+}
+
+func (f *fdDispenser) remove() int {
+ if f.empty() {
+ panic("fdDispenser out of fds")
+ }
+ rv := f.fds[0]
+ f.fds = f.fds[1:]
+ return rv
+}
+
+func (f *fdDispenser) empty() bool {
+ return len(f.fds) == 0
+}
+
+func adjustDirentCache(k *kernel.Kernel) error {
+ var hl syscall.Rlimit
+ if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil {
+ return fmt.Errorf("getting RLIMIT_NOFILE: %v", err)
+ }
+ if int64(hl.Cur) != syscall.RLIM_INFINITY {
+ newSize := hl.Cur / 2
+ if newSize < gofer.DefaultDirentCacheSize {
+ log.Infof("Setting gofer dirent cache size to %d", newSize)
+ gofer.DefaultDirentCacheSize = newSize
+ k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize)
+ }
+ }
+ return nil
+}
+
+// setupRootContainerFS creates a mount namespace containing the root filesystem
+// and all mounts. 'rootCtx' is used to walk directories to find mount points.
+// 'setMountNS' is called after namespace is created. It must set the mount NS
+// to 'rootCtx'.
+func setupRootContainerFS(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, goferFDs []int, setMountNS func(*fs.MountNamespace)) error {
+ mounts := compileMounts(spec)
+
+ // Create a tmpfs mount where we create and mount a root filesystem for
+ // each child container.
+ mounts = append(mounts, specs.Mount{
+ Type: tmpfs,
+ Destination: ChildContainersDir,
+ })
+
+ fds := &fdDispenser{fds: goferFDs}
+ rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts)
+ if err != nil {
+ return fmt.Errorf("creating root mount: %v", err)
+ }
+ mns, err := fs.NewMountNamespace(userCtx, rootInode)
+ if err != nil {
+ return fmt.Errorf("creating root mount namespace: %v", err)
+ }
+ setMountNS(mns)
+
+ root := mns.Root()
+ defer root.DecRef()
+ return mountSubmounts(rootCtx, conf, mns, root, mounts, fds)
+}
+
+// compileMounts returns the supported mounts from the mount spec, adding any
+// mandatory mounts that are required by the OCI specification.
+func compileMounts(spec *specs.Spec) []specs.Mount {
+ // Keep track of whether proc and sys were mounted.
+ var procMounted, sysMounted bool
+ var mounts []specs.Mount
+
+ // Always mount /dev.
+ mounts = append(mounts, specs.Mount{
+ Type: devtmpfs,
+ Destination: "/dev",
+ })
+
+ mounts = append(mounts, specs.Mount{
+ Type: devpts,
+ Destination: "/dev/pts",
+ })
+
+ // Mount all submounts from the spec.
+ for _, m := range spec.Mounts {
+ if !specutils.IsSupportedDevMount(m) {
+ log.Warningf("ignoring dev mount at %q", m.Destination)
+ continue
+ }
+ mounts = append(mounts, m)
+ switch filepath.Clean(m.Destination) {
+ case "/proc":
+ procMounted = true
+ case "/sys":
+ sysMounted = true
+ }
+ }
+
+ // Mount proc and sys even if the user did not ask for it, as the spec
+ // says we SHOULD.
+ var mandatoryMounts []specs.Mount
+ if !procMounted {
+ mandatoryMounts = append(mandatoryMounts, specs.Mount{
+ Type: proc,
+ Destination: "/proc",
+ })
+ }
+ if !sysMounted {
+ mandatoryMounts = append(mandatoryMounts, specs.Mount{
+ Type: sysfs,
+ Destination: "/sys",
+ })
+ }
+
+ // The mandatory mounts should be ordered right after the root, in case
+ // there are submounts of these mandatory mounts already in the spec.
+ mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...)
+
+ return mounts
+}
+
+// createRootMount creates the root filesystem.
+func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser, mounts []specs.Mount) (*fs.Inode, error) {
+ // First construct the filesystem from the spec.Root.
+ mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly || conf.Overlay}
+
+ var (
+ rootInode *fs.Inode
+ err error
+ )
+
+ fd := fds.remove()
+ log.Infof("Mounting root over 9P, ioFD: %d", fd)
+ p9FS := mustFindFilesystem("9p")
+ opts := p9MountOptions(fd, conf.FileAccess)
+ rootInode, err = p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
+ if err != nil {
+ return nil, fmt.Errorf("creating root mount point: %v", err)
+ }
+
+ // We need to overlay the root on top of a ramfs with stub directories
+ // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always
+ // mounted even if they are not in the spec.
+ submounts := append(subtargets("/", mounts), "/dev", "/sys", "/proc", "/tmp")
+ rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
+ if err != nil {
+ return nil, fmt.Errorf("adding submount overlay: %v", err)
+ }
+
+ if conf.Overlay && !spec.Root.Readonly {
+ log.Debugf("Adding overlay on top of root mount")
+ // Overlay a tmpfs filesystem on top of the root.
+ rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ log.Infof("Mounted %q to %q type root", spec.Root.Path, "/")
+ return rootInode, nil
+}
+
+func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
+ // Upper layer uses the same flags as lower, but it must be read-write.
+ lowerFlags.ReadOnly = false
+
+ tmpFS := mustFindFilesystem("tmpfs")
+ if !fs.IsDir(lower.StableAttr) {
+ // Create overlay on top of mount file, e.g. /etc/hostname.
+ msrc := fs.NewCachingMountSource(tmpFS, lowerFlags)
+ return fs.NewOverlayRootFile(ctx, msrc, lower, lowerFlags)
+ }
+
+ // Create overlay on top of mount dir.
+ upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "", nil)
+ if err != nil {
+ return nil, fmt.Errorf("creating tmpfs overlay: %v", err)
+ }
+ return fs.NewOverlayRoot(ctx, upper, lower, lowerFlags)
+}
+
+// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
+// used for mounts.
+func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (string, []string, bool, error) {
+ var (
+ fsName string
+ opts []string
+ useOverlay bool
+ err error
+ )
+
+ switch m.Type {
+ case devpts, devtmpfs, proc, sysfs:
+ fsName = m.Type
+ case nonefs:
+ fsName = sysfs
+ case tmpfs:
+ fsName = m.Type
+
+ // tmpfs has some extra supported options that we must pass through.
+ opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid")
+
+ case bind:
+ fd := fds.remove()
+ fsName = "9p"
+ // Non-root bind mounts are always shared.
+ opts = p9MountOptions(fd, FileAccessShared)
+ // If configured, add overlay to all writable mounts.
+ useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
+
+ default:
+ // TODO(nlacasse): Support all the mount types and make this a
+ // fatal error. Most applications will "just work" without
+ // them, so this is a warning for now.
+ // we do not support.
+ log.Warningf("ignoring unknown filesystem type %q", m.Type)
+ }
+ return fsName, opts, useOverlay, err
+}
+
+func mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, mounts []specs.Mount, fds *fdDispenser) error {
+ for _, m := range mounts {
+ if err := mountSubmount(ctx, conf, mns, root, fds, m, mounts); err != nil {
+ return fmt.Errorf("mount submount %q: %v", m.Destination, err)
+ }
+ }
+
+ if err := mountTmp(ctx, conf, mns, root, mounts); err != nil {
+ return fmt.Errorf("mount submount %q: %v", "tmp", err)
+ }
+
+ if !fds.empty() {
+ return fmt.Errorf("not all mount points were consumed, remaining: %v", fds)
+ }
+ return nil
+}
+
+// mountSubmount mounts volumes inside the container's root. Because mounts may
+// be readonly, a lower ramfs overlay is added to create the mount point dir.
+// Another overlay is added with tmpfs on top if Config.Overlay is true.
+// 'm.Destination' must be an absolute path with '..' and symlinks resolved.
+func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, fds *fdDispenser, m specs.Mount, mounts []specs.Mount) error {
+ // Map mount type to filesystem name, and parse out the options that we are
+ // capable of dealing with.
+ fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds)
+
+ // Return the error or nil that corresponds to the default case in getMountNameAndOptions.
+ if err != nil {
+ return err
+ }
+ if fsName == "" {
+ return nil
+ }
+
+ // All filesystem names should have been mapped to something we know.
+ filesystem := mustFindFilesystem(fsName)
+
+ mf := mountFlags(m.Options)
+ if useOverlay {
+ // All writes go to upper, be paranoid and make lower readonly.
+ mf.ReadOnly = true
+ }
+
+ inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
+ if err != nil {
+ return fmt.Errorf("creating mount with source %q: %v", m.Source, err)
+ }
+
+ // If there are submounts, we need to overlay the mount on top of a
+ // ramfs with stub directories for submount paths.
+ submounts := subtargets(m.Destination, mounts)
+ if len(submounts) > 0 {
+ log.Infof("Adding submount overlay over %q", m.Destination)
+ inode, err = addSubmountOverlay(ctx, inode, submounts)
+ if err != nil {
+ return fmt.Errorf("adding submount overlay: %v", err)
+ }
+ }
+
+ if useOverlay {
+ log.Debugf("Adding overlay on top of mount %q", m.Destination)
+ inode, err = addOverlay(ctx, conf, inode, m.Type, mf)
+ if err != nil {
+ return err
+ }
+ }
+
+ maxTraversals := uint(0)
+ dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals)
+ if err != nil {
+ return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
+ }
+ defer dirent.DecRef()
+ if err := mns.Mount(ctx, dirent, inode); err != nil {
+ return fmt.Errorf("mount %q error: %v", m.Destination, err)
+ }
+
+ log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type)
+ return nil
+}
+
+// p9MountOptions creates a slice of options for a p9 mount.
+func p9MountOptions(fd int, fa FileAccessType) []string {
+ opts := []string{
+ "trans=fd",
+ "rfdno=" + strconv.Itoa(fd),
+ "wfdno=" + strconv.Itoa(fd),
+ "privateunixsocket=true",
+ }
+ if fa == FileAccessShared {
+ opts = append(opts, "cache=remote_revalidating")
+ }
+ return opts
+}
+
+// parseAndFilterOptions parses a MountOptions slice and filters by the allowed
+// keys.
+func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) {
+ var out []string
+ for _, o := range opts {
+ kv := strings.Split(o, "=")
+ switch len(kv) {
+ case 1:
+ if specutils.ContainsStr(allowedKeys, o) {
+ out = append(out, o)
+ continue
+ }
+ log.Warningf("ignoring unsupported key %q", kv)
+ case 2:
+ if specutils.ContainsStr(allowedKeys, kv[0]) {
+ out = append(out, o)
+ continue
+ }
+ log.Warningf("ignoring unsupported key %q", kv[0])
+ default:
+ return nil, fmt.Errorf("invalid option %q", o)
+ }
+ }
+ return out, nil
+}
+
+// mountDevice returns a device string based on the fs type and target
+// of the mount.
+func mountDevice(m specs.Mount) string {
+ if m.Type == bind {
+ // Make a device string that includes the target, which is consistent across
+ // S/R and uniquely identifies the connection.
+ return "9pfs-" + m.Destination
+ }
+ // All other fs types use device "none".
+ return "none"
+}
+
+// addRestoreMount adds a mount to the MountSources map used for restoring a
+// checkpointed container.
+func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, fds *fdDispenser) error {
+ fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds)
+
+ // Return the error or nil that corresponds to the default case in getMountNameAndOptions.
+ if err != nil {
+ return err
+ }
+ // TODO(nlacasse): Fix this when we support all the mount types and
+ // make this a fatal error.
+ if fsName == "" {
+ return nil
+ }
+
+ newMount := fs.MountArgs{
+ Dev: mountDevice(m),
+ Flags: mountFlags(m.Options),
+ DataString: strings.Join(opts, ","),
+ }
+ if useOverlay {
+ newMount.Flags.ReadOnly = true
+ }
+ renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount)
+ log.Infof("Added mount at %q: %+v", fsName, newMount)
+ return nil
+}
+
+// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding the mounts
+// to the environment.
+func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.RestoreEnvironment, error) {
+ renv := &fs.RestoreEnvironment{
+ MountSources: make(map[string][]fs.MountArgs),
+ }
+
+ // Add root mount.
+ fd := fds.remove()
+ opts := p9MountOptions(fd, conf.FileAccess)
+
+ mf := fs.MountSourceFlags{}
+ if spec.Root.Readonly || conf.Overlay {
+ mf.ReadOnly = true
+ }
+
+ rootMount := fs.MountArgs{
+ Dev: rootDevice,
+ Flags: mf,
+ DataString: strings.Join(opts, ","),
+ }
+ renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount)
+
+ // Add submounts.
+ var tmpMounted bool
+ for _, m := range compileMounts(spec) {
+ if err := addRestoreMount(conf, renv, m, fds); err != nil {
+ return nil, err
+ }
+ if filepath.Clean(m.Destination) == "/tmp" {
+ tmpMounted = true
+ }
+ }
+
+ // TODO(b/67958150): handle '/tmp' properly (see mountTmp()).
+ if !tmpMounted {
+ tmpMount := specs.Mount{
+ Type: tmpfs,
+ Destination: "/tmp",
+ }
+ if err := addRestoreMount(conf, renv, tmpMount, fds); err != nil {
+ return nil, err
+ }
+ }
+
+ return renv, nil
+}
+
+func mountFlags(opts []string) fs.MountSourceFlags {
+ mf := fs.MountSourceFlags{}
+ for _, o := range opts {
+ switch o {
+ case "rw":
+ mf.ReadOnly = false
+ case "ro":
+ mf.ReadOnly = true
+ case "noatime":
+ mf.NoAtime = true
+ case "noexec":
+ mf.NoExec = true
+ default:
+ log.Warningf("ignoring unknown mount option %q", o)
+ }
+ }
+ return mf
+}
+
+func mustFindFilesystem(name string) fs.Filesystem {
+ fs, ok := fs.FindFilesystem(name)
+ if !ok {
+ panic(fmt.Sprintf("could not find filesystem %q", name))
+ }
+ return fs
+}
+
+// addSubmountOverlay overlays the inode over a ramfs tree containing the given
+// paths.
+func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
+ msrc := fs.NewPseudoMountSource()
+ mountTree, err := ramfs.MakeDirectoryTree(ctx, msrc, submounts)
+ if err != nil {
+ return nil, fmt.Errorf("creating mount tree: %v", err)
+ }
+ overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{})
+ if err != nil {
+ return nil, fmt.Errorf("adding mount overlay: %v", err)
+ }
+ return overlayInode, err
+}
+
+// subtargets takes a set of Mounts and returns only the targets that are
+// children of the given root. The returned paths are relative to the root.
+func subtargets(root string, mnts []specs.Mount) []string {
+ var targets []string
+ for _, mnt := range mnts {
+ if relPath, isSubpath := fs.IsSubpath(mnt.Destination, root); isSubpath {
+ targets = append(targets, relPath)
+ }
+ }
+ return targets
+}
+
+// setupContainerFS is used to set up the file system and amend the procArgs accordingly.
+// procArgs are passed by reference and the FDMap field is modified. It dups stdioFDs.
+func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error {
+ ctx := procArgs.NewContext(k)
+
+ // Create the FD map, which will set stdin, stdout, and stderr. If console
+ // is true, then ioctl calls will be passed through to the host fd.
+ fdm, err := createFDMap(ctx, k, ls, console, stdioFDs)
+ if err != nil {
+ return fmt.Errorf("importing fds: %v", err)
+ }
+
+ // CreateProcess takes a reference on FDMap if successful. We
+ // won't need ours either way.
+ procArgs.FDMap = fdm
+
+ // Use root user to configure mounts. The current user might not have
+ // permission to do so.
+ rootProcArgs := kernel.CreateProcessArgs{
+ WorkingDirectory: "/",
+ Credentials: auth.NewRootCredentials(creds.UserNamespace),
+ Umask: 0022,
+ MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
+ }
+ rootCtx := rootProcArgs.NewContext(k)
+
+ // If this is the root container, we also need to setup the root mount
+ // namespace.
+ mns := k.RootMountNamespace()
+ if mns == nil {
+ // Setup the root container.
+ return setupRootContainerFS(ctx, rootCtx, spec, conf, goferFDs, func(mns *fs.MountNamespace) {
+ k.SetRootMountNamespace(mns)
+ })
+ }
+
+ // Setup a child container.
+ log.Infof("Creating new process in child container.")
+ globalRoot := mns.Root()
+ defer globalRoot.DecRef()
+
+ // Create mount point for the container's rootfs.
+ maxTraversals := uint(0)
+ contDir, err := mns.FindInode(ctx, globalRoot, nil, ChildContainersDir, &maxTraversals)
+ if err != nil {
+ return fmt.Errorf("couldn't find child container dir %q: %v", ChildContainersDir, err)
+ }
+ if err := contDir.CreateDirectory(ctx, globalRoot, cid, fs.FilePermsFromMode(0755)); err != nil {
+ return fmt.Errorf("create directory %q: %v", cid, err)
+ }
+ containerRoot, err := contDir.Walk(ctx, globalRoot, cid)
+ if err != nil {
+ return fmt.Errorf("walk to %q failed: %v", cid, err)
+ }
+ defer containerRoot.DecRef()
+
+ // Create the container's root filesystem mount.
+ fds := &fdDispenser{fds: goferFDs}
+ rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil)
+ if err != nil {
+ return fmt.Errorf("creating filesystem for container: %v", err)
+ }
+
+ // Mount the container's root filesystem to the newly created mount point.
+ if err := mns.Mount(ctx, containerRoot, rootInode); err != nil {
+ return fmt.Errorf("mount container root: %v", err)
+ }
+
+ // We have to re-walk to the dirent to find the mounted
+ // directory. The old dirent is invalid at this point.
+ containerRoot, err = contDir.Walk(ctx, globalRoot, cid)
+ if err != nil {
+ return fmt.Errorf("find container mount point %q: %v", cid, err)
+ }
+ cu := specutils.MakeCleanup(func() { containerRoot.DecRef() })
+ defer cu.Clean()
+
+ log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, cid))
+
+ // Set process root here, so 'rootCtx.Value(CtxRoot)' will return it.
+ procArgs.Root = containerRoot
+
+ // Mount all submounts.
+ mounts := compileMounts(spec)
+ if err := mountSubmounts(rootCtx, conf, mns, containerRoot, mounts, fds); err != nil {
+ return err
+ }
+ cu.Release()
+ return nil
+}
+
+// setExecutablePath sets the procArgs.Filename by searching the PATH for an
+// executable matching the procArgs.Argv[0].
+func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error {
+ paths := fs.GetPath(procArgs.Envv)
+ exe := procArgs.Argv[0]
+ f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths)
+ if err != nil {
+ return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err)
+ }
+ procArgs.Filename = f
+ return nil
+}
+
+// destroyContainerFS cleans up the filesystem by unmounting all mounts for the
+// given container and deleting the container root directory.
+func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error {
+ defer func() {
+ // Flushing dirent references triggers many async close
+ // operations. We must wait for those to complete before
+ // returning, otherwise the caller may kill the gofer before
+ // they complete, causing a cascade of failing RPCs.
+ //
+ // This must take place in the first deferred function, so that
+ // it runs after all the other deferred DecRef() calls in this
+ // function.
+ log.Infof("Waiting for async filesystem operations to complete")
+ fs.AsyncBarrier()
+ }()
+
+ // First get a reference to the container root directory.
+ mns := k.RootMountNamespace()
+ mnsRoot := mns.Root()
+ defer mnsRoot.DecRef()
+ containerRoot := path.Join(ChildContainersDir, cid)
+ maxTraversals := uint(0)
+ containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, &maxTraversals)
+ if err == syserror.ENOENT {
+ // Container must have been destroyed already. That's fine.
+ return nil
+ }
+ if err != nil {
+ return fmt.Errorf("finding container root directory %q: %v", containerRoot, err)
+ }
+ defer containerRootDirent.DecRef()
+
+ // Iterate through all submounts and unmount them. We unmount lazily by
+ // setting detach=true, so we can unmount in any order.
+ mnt := mns.FindMount(containerRootDirent)
+ for _, m := range mns.AllMountsUnder(mnt) {
+ root := m.Root()
+ defer root.DecRef()
+
+ // Do a best-effort unmount by flushing the refs and unmount
+ // with "detach only = true". Unmount returns EINVAL when the mount point
+ // doesn't exist, i.e. it has already been unmounted.
+ log.Debugf("Unmounting container mount %q", root.BaseName())
+ root.Inode.MountSource.FlushDirentRefs()
+ if err := mns.Unmount(ctx, root, true /* detach only */); err != nil && err != syserror.EINVAL {
+ return fmt.Errorf("unmounting container mount %q: %v", root.BaseName(), err)
+ }
+ }
+
+ // Get a reference to the parent directory and remove the root
+ // container directory.
+ maxTraversals = 0
+ containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, &maxTraversals)
+ if err != nil {
+ return fmt.Errorf("finding containers directory %q: %v", ChildContainersDir, err)
+ }
+ defer containersDirDirent.DecRef()
+ log.Debugf("Deleting container root %q", containerRoot)
+ if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, cid); err != nil {
+ return fmt.Errorf("removing directory %q: %v", containerRoot, err)
+ }
+
+ return nil
+}
+
+// mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so.
+// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
+// the host /tmp, but this is a nice optimization, and fixes some apps that call
+// mknod in /tmp. It's unsafe to mount tmpfs if:
+// 1. /tmp is mounted explictly: we should not override user's wish
+// 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
+//
+// Note that when there are submounts inside of '/tmp', directories for the
+// mount points must be present, making '/tmp' not empty anymore.
+func mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, mounts []specs.Mount) error {
+ for _, m := range mounts {
+ if filepath.Clean(m.Destination) == "/tmp" {
+ log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m)
+ return nil
+ }
+ }
+
+ maxTraversals := uint(0)
+ tmp, err := mns.FindInode(ctx, root, root, "tmp", &maxTraversals)
+ switch err {
+ case nil:
+ // Found '/tmp' in filesystem, check if it's empty.
+ defer tmp.DecRef()
+ f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true})
+ if err != nil {
+ return err
+ }
+ defer f.DecRef()
+ serializer := &fs.CollectEntriesSerializer{}
+ if err := f.Readdir(ctx, serializer); err != nil {
+ return err
+ }
+ // If more than "." and ".." is found, skip internal tmpfs to prevent hiding
+ // existing files.
+ if len(serializer.Order) > 2 {
+ log.Infof("Skipping internal tmpfs on top %q, because it's not empty", "/tmp")
+ return nil
+ }
+ log.Infof("Mounting internal tmpfs on top of empty %q", "/tmp")
+ fallthrough
+
+ case syserror.ENOENT:
+ // No '/tmp' found (or fallthrough from above). Safe to mount internal
+ // tmpfs.
+ tmpMount := specs.Mount{
+ Type: tmpfs,
+ Destination: "/tmp",
+ // Sticky bit is added to prevent accidental deletion of files from
+ // another user. This is normally done for /tmp.
+ Options: []string{"mode=1777"},
+ }
+ return mountSubmount(ctx, conf, mns, root, nil, tmpMount, mounts)
+
+ default:
+ return err
+ }
+}
diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go
new file mode 100644
index 000000000..3364aa5e6
--- /dev/null
+++ b/runsc/boot/limits.go
@@ -0,0 +1,154 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+ "sync"
+ "syscall"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+)
+
+// Mapping from linux resource names to limits.LimitType.
+var fromLinuxResource = map[string]limits.LimitType{
+ "RLIMIT_AS": limits.AS,
+ "RLIMIT_CORE": limits.Core,
+ "RLIMIT_CPU": limits.CPU,
+ "RLIMIT_DATA": limits.Data,
+ "RLIMIT_FSIZE": limits.FileSize,
+ "RLIMIT_LOCKS": limits.Locks,
+ "RLIMIT_MEMLOCK": limits.MemoryLocked,
+ "RLIMIT_MSGQUEUE": limits.MessageQueueBytes,
+ "RLIMIT_NICE": limits.Nice,
+ "RLIMIT_NOFILE": limits.NumberOfFiles,
+ "RLIMIT_NPROC": limits.ProcessCount,
+ "RLIMIT_RSS": limits.Rss,
+ "RLIMIT_RTPRIO": limits.RealTimePriority,
+ "RLIMIT_RTTIME": limits.Rttime,
+ "RLIMIT_SIGPENDING": limits.SignalsPending,
+ "RLIMIT_STACK": limits.Stack,
+}
+
+func findName(lt limits.LimitType) string {
+ for k, v := range fromLinuxResource {
+ if v == lt {
+ return k
+ }
+ }
+ return "unknown"
+}
+
+var defaults defs
+
+type defs struct {
+ mu sync.Mutex
+ set *limits.LimitSet
+ err error
+}
+
+func (d *defs) get() (*limits.LimitSet, error) {
+ d.mu.Lock()
+ defer d.mu.Unlock()
+
+ if d.err != nil {
+ return nil, d.err
+ }
+ if d.set == nil {
+ if err := d.initDefaults(); err != nil {
+ d.err = err
+ return nil, err
+ }
+ }
+ return d.set, nil
+}
+
+func (d *defs) initDefaults() error {
+ ls, err := limits.NewLinuxLimitSet()
+ if err != nil {
+ return err
+ }
+
+ // Set default limits based on what containers get by default, ex:
+ // $ docker run --rm debian prlimit
+ ls.SetUnchecked(limits.AS, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+ ls.SetUnchecked(limits.Core, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+ ls.SetUnchecked(limits.CPU, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+ ls.SetUnchecked(limits.Data, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+ ls.SetUnchecked(limits.FileSize, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+ ls.SetUnchecked(limits.Locks, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+ ls.SetUnchecked(limits.MemoryLocked, limits.Limit{Cur: 65536, Max: 65536})
+ ls.SetUnchecked(limits.MessageQueueBytes, limits.Limit{Cur: 819200, Max: 819200})
+ ls.SetUnchecked(limits.Nice, limits.Limit{Cur: 0, Max: 0})
+ ls.SetUnchecked(limits.NumberOfFiles, limits.Limit{Cur: 1048576, Max: 1048576})
+ ls.SetUnchecked(limits.ProcessCount, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+ ls.SetUnchecked(limits.Rss, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+ ls.SetUnchecked(limits.RealTimePriority, limits.Limit{Cur: 0, Max: 0})
+ ls.SetUnchecked(limits.Rttime, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+ ls.SetUnchecked(limits.SignalsPending, limits.Limit{Cur: 0, Max: 0})
+ ls.SetUnchecked(limits.Stack, limits.Limit{Cur: 8388608, Max: limits.Infinity})
+
+ // Read host limits that directly affect the sandbox and adjust the defaults
+ // based on them.
+ for _, res := range []int{syscall.RLIMIT_FSIZE, syscall.RLIMIT_NOFILE} {
+ var hl syscall.Rlimit
+ if err := syscall.Getrlimit(res, &hl); err != nil {
+ return err
+ }
+
+ lt, ok := limits.FromLinuxResource[res]
+ if !ok {
+ return fmt.Errorf("unknown rlimit type %v", res)
+ }
+ hostLimit := limits.Limit{
+ Cur: limits.FromLinux(hl.Cur),
+ Max: limits.FromLinux(hl.Max),
+ }
+
+ defaultLimit := ls.Get(lt)
+ if hostLimit.Cur != limits.Infinity && hostLimit.Cur < defaultLimit.Cur {
+ log.Warningf("Host limit is lower than recommended, resource: %q, host: %d, recommended: %d", findName(lt), hostLimit.Cur, defaultLimit.Cur)
+ }
+ if hostLimit.Cur != defaultLimit.Cur || hostLimit.Max != defaultLimit.Max {
+ log.Infof("Setting limit from host, resource: %q {soft: %d, hard: %d}", findName(lt), hostLimit.Cur, hostLimit.Max)
+ ls.SetUnchecked(lt, hostLimit)
+ }
+ }
+
+ d.set = ls
+ return nil
+}
+
+func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) {
+ ls, err := defaults.get()
+ if err != nil {
+ return nil, err
+ }
+
+ // Then apply overwrites on top of defaults.
+ for _, rl := range spec.Process.Rlimits {
+ lt, ok := fromLinuxResource[rl.Type]
+ if !ok {
+ return nil, fmt.Errorf("unknown resource %q", rl.Type)
+ }
+ ls.SetUnchecked(lt, limits.Limit{
+ Cur: rl.Soft,
+ Max: rl.Hard,
+ })
+ }
+ return ls, nil
+}
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
new file mode 100644
index 000000000..6ac6b94dd
--- /dev/null
+++ b/runsc/boot/loader.go
@@ -0,0 +1,954 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package boot loads the kernel and runs a container.
+package boot
+
+import (
+ "fmt"
+ mrand "math/rand"
+ "os"
+ "runtime"
+ "sync"
+ "sync/atomic"
+ "syscall"
+ gtime "time"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/rand"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/control"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/inet"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/loader"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memutil"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/sighandling"
+ slinux "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/icmp"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/udp"
+ "gvisor.googlesource.com/gvisor/runsc/boot/filter"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+
+ // Include supported socket providers.
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/hostinet"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/route"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix"
+)
+
+// Loader keeps state needed to start the kernel and run the container..
+type Loader struct {
+ // k is the kernel.
+ k *kernel.Kernel
+
+ // ctrl is the control server.
+ ctrl *controller
+
+ conf *Config
+
+ // console is set to true if terminal is enabled.
+ console bool
+
+ watchdog *watchdog.Watchdog
+
+ // stdioFDs contains stdin, stdout, and stderr.
+ stdioFDs []int
+
+ // goferFDs are the FDs that attach the sandbox to the gofers.
+ goferFDs []int
+
+ // spec is the base configuration for the root container.
+ spec *specs.Spec
+
+ // startSignalForwarding enables forwarding of signals to the sandboxed
+ // container. It should be called after the init process is loaded.
+ startSignalForwarding func() func()
+
+ // stopSignalForwarding disables forwarding of signals to the sandboxed
+ // container. It should be called when a sandbox is destroyed.
+ stopSignalForwarding func()
+
+ // restore is set to true if we are restoring a container.
+ restore bool
+
+ // rootProcArgs refers to the root sandbox init task.
+ rootProcArgs kernel.CreateProcessArgs
+
+ // sandboxID is the ID for the whole sandbox.
+ sandboxID string
+
+ // mu guards processes.
+ mu sync.Mutex
+
+ // processes maps containers init process and invocation of exec. Root
+ // processes are keyed with container ID and pid=0, while exec invocations
+ // have the corresponding pid set.
+ //
+ // processes is guardded by mu.
+ processes map[execID]*execProcess
+}
+
+// execID uniquely identifies a sentry process that is executed in a container.
+type execID struct {
+ cid string
+ pid kernel.ThreadID
+}
+
+// execProcess contains the thread group and host TTY of a sentry process.
+type execProcess struct {
+ // tg will be nil for containers that haven't started yet.
+ tg *kernel.ThreadGroup
+
+ // tty will be nil if the process is not attached to a terminal.
+ tty *host.TTYFileOperations
+}
+
+func init() {
+ // Initialize the random number generator.
+ mrand.Seed(gtime.Now().UnixNano())
+
+ // Register the global syscall table.
+ kernel.RegisterSyscallTable(slinux.AMD64)
+}
+
+// Args are the arguments for New().
+type Args struct {
+ // Id is the sandbox ID.
+ ID string
+ // Spec is the sandbox specification.
+ Spec *specs.Spec
+ // Conf is the system configuration.
+ Conf *Config
+ // ControllerFD is the FD to the URPC controller.
+ ControllerFD int
+ // Device is an optional argument that is passed to the platform.
+ Device *os.File
+ // GoferFDs is an array of FDs used to connect with the Gofer.
+ GoferFDs []int
+ // StdioFDs is the stdio for the application.
+ StdioFDs []int
+ // Console is set to true if using TTY.
+ Console bool
+ // NumCPU is the number of CPUs to create inside the sandbox.
+ NumCPU int
+ // TotalMem is the initial amount of total memory to report back to the
+ // container.
+ TotalMem uint64
+ // UserLogFD is the file descriptor to write user logs to.
+ UserLogFD int
+}
+
+// New initializes a new kernel loader configured by spec.
+// New also handles setting up a kernel for restoring a container.
+func New(args Args) (*Loader, error) {
+ // We initialize the rand package now to make sure /dev/urandom is pre-opened
+ // on kernels that do not support getrandom(2).
+ if err := rand.Init(); err != nil {
+ return nil, fmt.Errorf("setting up rand: %v", err)
+ }
+
+ if err := usage.Init(); err != nil {
+ return nil, fmt.Errorf("setting up memory usage: %v", err)
+ }
+
+ // Create kernel and platform.
+ p, err := createPlatform(args.Conf, args.Device)
+ if err != nil {
+ return nil, fmt.Errorf("creating platform: %v", err)
+ }
+ k := &kernel.Kernel{
+ Platform: p,
+ }
+
+ // Create memory file.
+ mf, err := createMemoryFile()
+ if err != nil {
+ return nil, fmt.Errorf("creating memory file: %v", err)
+ }
+ k.SetMemoryFile(mf)
+
+ // Create VDSO.
+ //
+ // Pass k as the platform since it is savable, unlike the actual platform.
+ vdso, err := loader.PrepareVDSO(k)
+ if err != nil {
+ return nil, fmt.Errorf("creating vdso: %v", err)
+ }
+
+ // Create timekeeper.
+ tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
+ if err != nil {
+ return nil, fmt.Errorf("creating timekeeper: %v", err)
+ }
+ tk.SetClocks(time.NewCalibratedClocks())
+
+ if err := enableStrace(args.Conf); err != nil {
+ return nil, fmt.Errorf("enabling strace: %v", err)
+ }
+
+ // Create an empty network stack because the network namespace may be empty at
+ // this point. Netns is configured before Run() is called. Netstack is
+ // configured using a control uRPC message. Host network is configured inside
+ // Run().
+ networkStack, err := newEmptyNetworkStack(args.Conf, k)
+ if err != nil {
+ return nil, fmt.Errorf("creating network: %v", err)
+ }
+
+ // Create capabilities.
+ caps, err := specutils.Capabilities(args.Conf.EnableRaw, args.Spec.Process.Capabilities)
+ if err != nil {
+ return nil, fmt.Errorf("converting capabilities: %v", err)
+ }
+
+ // Convert the spec's additional GIDs to KGIDs.
+ extraKGIDs := make([]auth.KGID, 0, len(args.Spec.Process.User.AdditionalGids))
+ for _, GID := range args.Spec.Process.User.AdditionalGids {
+ extraKGIDs = append(extraKGIDs, auth.KGID(GID))
+ }
+
+ // Create credentials.
+ creds := auth.NewUserCredentials(
+ auth.KUID(args.Spec.Process.User.UID),
+ auth.KGID(args.Spec.Process.User.GID),
+ extraKGIDs,
+ caps,
+ auth.NewRootUserNamespace())
+
+ if args.NumCPU == 0 {
+ args.NumCPU = runtime.NumCPU()
+ }
+ log.Infof("CPUs: %d", args.NumCPU)
+
+ if args.TotalMem > 0 {
+ // Adjust the total memory returned by the Sentry so that applications that
+ // use /proc/meminfo can make allocations based on this limit.
+ usage.MinimumTotalMemoryBytes = args.TotalMem
+ log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(2^30))
+ }
+
+ // Initiate the Kernel object, which is required by the Context passed
+ // to createVFS in order to mount (among other things) procfs.
+ if err = k.Init(kernel.InitKernelArgs{
+ FeatureSet: cpuid.HostFeatureSet(),
+ Timekeeper: tk,
+ RootUserNamespace: creds.UserNamespace,
+ NetworkStack: networkStack,
+ ApplicationCores: uint(args.NumCPU),
+ Vdso: vdso,
+ RootUTSNamespace: kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
+ RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace),
+ RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
+ }); err != nil {
+ return nil, fmt.Errorf("initializing kernel: %v", err)
+ }
+
+ if err := adjustDirentCache(k); err != nil {
+ return nil, err
+ }
+
+ // Turn on packet logging if enabled.
+ if args.Conf.LogPackets {
+ log.Infof("Packet logging enabled")
+ atomic.StoreUint32(&sniffer.LogPackets, 1)
+ } else {
+ log.Infof("Packet logging disabled")
+ atomic.StoreUint32(&sniffer.LogPackets, 0)
+ }
+
+ // Create a watchdog.
+ watchdog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction)
+
+ procArgs, err := newProcess(args.ID, args.Spec, creds, k)
+ if err != nil {
+ return nil, fmt.Errorf("creating init process for root container: %v", err)
+ }
+
+ if err := initCompatLogs(args.UserLogFD); err != nil {
+ return nil, fmt.Errorf("initializing compat logs: %v", err)
+ }
+
+ eid := execID{cid: args.ID}
+ l := &Loader{
+ k: k,
+ conf: args.Conf,
+ console: args.Console,
+ watchdog: watchdog,
+ spec: args.Spec,
+ goferFDs: args.GoferFDs,
+ stdioFDs: args.StdioFDs,
+ rootProcArgs: procArgs,
+ sandboxID: args.ID,
+ processes: map[execID]*execProcess{eid: {}},
+ }
+
+ // We don't care about child signals; some platforms can generate a
+ // tremendous number of useless ones (I'm looking at you, ptrace).
+ if err := sighandling.IgnoreChildStop(); err != nil {
+ return nil, fmt.Errorf("ignore child stop signals failed: %v", err)
+ }
+
+ // Handle signals by forwarding them to the root container process
+ // (except for panic signal, which should cause a panic).
+ l.startSignalForwarding = sighandling.PrepareHandler(func(sig linux.Signal) {
+ // Panic signal should cause a panic.
+ if args.Conf.PanicSignal != -1 && sig == linux.Signal(args.Conf.PanicSignal) {
+ panic("Signal-induced panic")
+ }
+
+ // Otherwise forward to root container.
+ deliveryMode := DeliverToProcess
+ if args.Console {
+ // Since we are running with a console, we should
+ // forward the signal to the foreground process group
+ // so that job control signals like ^C can be handled
+ // properly.
+ deliveryMode = DeliverToForegroundProcessGroup
+ }
+ log.Infof("Received external signal %d, mode: %v", sig, deliveryMode)
+ if err := l.signal(args.ID, 0, int32(sig), deliveryMode); err != nil {
+ log.Warningf("error sending signal %v to container %q: %v", sig, args.ID, err)
+ }
+ })
+
+ // Create the control server using the provided FD.
+ //
+ // This must be done *after* we have initialized the kernel since the
+ // controller is used to configure the kernel's network stack.
+ ctrl, err := newController(args.ControllerFD, l)
+ if err != nil {
+ return nil, fmt.Errorf("creating control server: %v", err)
+ }
+ l.ctrl = ctrl
+
+ // Only start serving after Loader is set to controller and controller is set
+ // to Loader, because they are both used in the urpc methods.
+ if err := ctrl.srv.StartServing(); err != nil {
+ return nil, fmt.Errorf("starting control server: %v", err)
+ }
+
+ return l, nil
+}
+
+// newProcess creates a process that can be run with kernel.CreateProcess.
+func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel) (kernel.CreateProcessArgs, error) {
+ // Create initial limits.
+ ls, err := createLimitSet(spec)
+ if err != nil {
+ return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %v", err)
+ }
+
+ // Create the process arguments.
+ procArgs := kernel.CreateProcessArgs{
+ Argv: spec.Process.Args,
+ Envv: spec.Process.Env,
+ WorkingDirectory: spec.Process.Cwd, // Defaults to '/' if empty.
+ Credentials: creds,
+ Umask: 0022,
+ Limits: ls,
+ MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
+ UTSNamespace: k.RootUTSNamespace(),
+ IPCNamespace: k.RootIPCNamespace(),
+ AbstractSocketNamespace: k.RootAbstractSocketNamespace(),
+ ContainerID: id,
+ }
+ return procArgs, nil
+}
+
+// Destroy cleans up all resources used by the loader.
+//
+// Note that this will block until all open control server connections have
+// been closed. For that reason, this should NOT be called in a defer, because
+// a panic in a control server rpc would then hang forever.
+func (l *Loader) Destroy() {
+ if l.ctrl != nil {
+ l.ctrl.srv.Stop()
+ }
+ if l.stopSignalForwarding != nil {
+ l.stopSignalForwarding()
+ }
+ l.watchdog.Stop()
+}
+
+func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) {
+ switch conf.Platform {
+ case PlatformPtrace:
+ log.Infof("Platform: ptrace")
+ return ptrace.New()
+ case PlatformKVM:
+ log.Infof("Platform: kvm")
+ if deviceFile == nil {
+ return nil, fmt.Errorf("kvm device file must be provided")
+ }
+ return kvm.New(deviceFile)
+ default:
+ return nil, fmt.Errorf("invalid platform %v", conf.Platform)
+ }
+}
+
+func createMemoryFile() (*pgalloc.MemoryFile, error) {
+ const memfileName = "runsc-memory"
+ memfd, err := memutil.CreateMemFD(memfileName, 0)
+ if err != nil {
+ return nil, fmt.Errorf("error creating memfd: %v", err)
+ }
+ memfile := os.NewFile(uintptr(memfd), memfileName)
+ mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
+ if err != nil {
+ memfile.Close()
+ return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err)
+ }
+ return mf, nil
+}
+
+// Run runs the root container..
+func (l *Loader) Run() error {
+ err := l.run()
+ l.ctrl.manager.startResultChan <- err
+ if err != nil {
+ // Give the controller some time to send the error to the
+ // runtime. If we return too quickly here the process will exit
+ // and the control connection will be closed before the error
+ // is returned.
+ gtime.Sleep(2 * gtime.Second)
+ return err
+ }
+ return nil
+}
+
+func (l *Loader) run() error {
+ if l.conf.Network == NetworkHost {
+ // Delay host network configuration to this point because network namespace
+ // is configured after the loader is created and before Run() is called.
+ log.Debugf("Configuring host network")
+ stack := l.k.NetworkStack().(*hostinet.Stack)
+ if err := stack.Configure(); err != nil {
+ return err
+ }
+ }
+
+ l.mu.Lock()
+ defer l.mu.Unlock()
+
+ eid := execID{cid: l.sandboxID}
+ ep, ok := l.processes[eid]
+ if !ok {
+ return fmt.Errorf("trying to start deleted container %q", l.sandboxID)
+ }
+
+ // Finally done with all configuration. Setup filters before user code
+ // is loaded.
+ if l.conf.DisableSeccomp {
+ filter.Report("syscall filter is DISABLED. Running in less secure mode.")
+ } else {
+ opts := filter.Options{
+ Platform: l.k.Platform,
+ HostNetwork: l.conf.Network == NetworkHost,
+ ProfileEnable: l.conf.ProfileEnable,
+ ControllerFD: l.ctrl.srv.FD(),
+ }
+ if err := filter.Install(opts); err != nil {
+ return fmt.Errorf("installing seccomp filters: %v", err)
+ }
+ }
+
+ // If we are restoring, we do not want to create a process.
+ // l.restore is set by the container manager when a restore call is made.
+ if !l.restore {
+ if err := setupContainerFS(
+ &l.rootProcArgs,
+ l.spec,
+ l.conf,
+ l.stdioFDs,
+ l.goferFDs,
+ l.console,
+ l.rootProcArgs.Credentials,
+ l.rootProcArgs.Limits,
+ l.k,
+ "" /* CID, which isn't needed for the root container */); err != nil {
+ return err
+ }
+
+ rootCtx := l.rootProcArgs.NewContext(l.k)
+ rootMns := l.k.RootMountNamespace()
+ if err := setExecutablePath(rootCtx, rootMns, &l.rootProcArgs); err != nil {
+ return err
+ }
+
+ // Create the root container init task. It will begin running
+ // when the kernel is started.
+ if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil {
+ return fmt.Errorf("creating init process: %v", err)
+ }
+
+ // CreateProcess takes a reference on FDMap if successful.
+ l.rootProcArgs.FDMap.DecRef()
+ }
+
+ ep.tg = l.k.GlobalInit()
+ if l.console {
+ ttyFile := l.rootProcArgs.FDMap.GetFile(0)
+ defer ttyFile.DecRef()
+ ep.tty = ttyFile.FileOperations.(*host.TTYFileOperations)
+
+ // Set the foreground process group on the TTY to the global
+ // init process group, since that is what we are about to
+ // start running.
+ ep.tty.InitForegroundProcessGroup(ep.tg.ProcessGroup())
+ }
+
+ // Start signal forwarding only after an init process is created.
+ l.stopSignalForwarding = l.startSignalForwarding()
+
+ log.Infof("Process should have started...")
+ l.watchdog.Start()
+ return l.k.Start()
+}
+
+// createContainer creates a new container inside the sandbox.
+func (l *Loader) createContainer(cid string) error {
+ l.mu.Lock()
+ defer l.mu.Unlock()
+
+ eid := execID{cid: cid}
+ if _, ok := l.processes[eid]; ok {
+ return fmt.Errorf("container %q already exists", cid)
+ }
+ l.processes[eid] = &execProcess{}
+ return nil
+}
+
+// startContainer starts a child container. It returns the thread group ID of
+// the newly created process. Caller owns 'files' and may close them after
+// this method returns.
+func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
+ // Create capabilities.
+ caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
+ if err != nil {
+ return fmt.Errorf("creating capabilities: %v", err)
+ }
+
+ l.mu.Lock()
+ defer l.mu.Unlock()
+
+ eid := execID{cid: cid}
+ if _, ok := l.processes[eid]; !ok {
+ return fmt.Errorf("trying to start a deleted container %q", cid)
+ }
+
+ // Convert the spec's additional GIDs to KGIDs.
+ extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids))
+ for _, GID := range spec.Process.User.AdditionalGids {
+ extraKGIDs = append(extraKGIDs, auth.KGID(GID))
+ }
+
+ // Create credentials. We reuse the root user namespace because the
+ // sentry currently supports only 1 mount namespace, which is tied to a
+ // single user namespace. Thus we must run in the same user namespace
+ // to access mounts.
+ // TODO(b/63601033): Create a new mount namespace for the container.
+ creds := auth.NewUserCredentials(
+ auth.KUID(spec.Process.User.UID),
+ auth.KGID(spec.Process.User.GID),
+ extraKGIDs,
+ caps,
+ l.k.RootUserNamespace())
+
+ procArgs, err := newProcess(cid, spec, creds, l.k)
+ if err != nil {
+ return fmt.Errorf("creating new process: %v", err)
+ }
+
+ // setupContainerFS() dups stdioFDs, so we don't need to dup them here.
+ var stdioFDs []int
+ for _, f := range files[:3] {
+ stdioFDs = append(stdioFDs, int(f.Fd()))
+ }
+
+ // Can't take ownership away from os.File. dup them to get a new FDs.
+ var goferFDs []int
+ for _, f := range files[3:] {
+ fd, err := syscall.Dup(int(f.Fd()))
+ if err != nil {
+ return fmt.Errorf("failed to dup file: %v", err)
+ }
+ goferFDs = append(goferFDs, fd)
+ }
+
+ if err := setupContainerFS(
+ &procArgs,
+ spec,
+ conf,
+ stdioFDs,
+ goferFDs,
+ false,
+ creds,
+ procArgs.Limits,
+ k,
+ cid); err != nil {
+ return fmt.Errorf("configuring container FS: %v", err)
+ }
+
+ ctx := procArgs.NewContext(l.k)
+ mns := k.RootMountNamespace()
+ if err := setExecutablePath(ctx, mns, &procArgs); err != nil {
+ return fmt.Errorf("setting executable path for %+v: %v", procArgs, err)
+ }
+
+ // Create and start the new process.
+ tg, _, err := l.k.CreateProcess(procArgs)
+ if err != nil {
+ return fmt.Errorf("creating process: %v", err)
+ }
+ l.k.StartProcess(tg)
+
+ // CreateProcess takes a reference on FDMap if successful.
+ procArgs.FDMap.DecRef()
+
+ l.processes[eid].tg = tg
+ return nil
+}
+
+// destroyContainer stops a container if it is still running and cleans up its
+// filesystem.
+func (l *Loader) destroyContainer(cid string) error {
+ l.mu.Lock()
+ defer l.mu.Unlock()
+
+ // Has the container started?
+ if _, _, err := l.threadGroupFromIDLocked(execID{cid: cid}); err == nil {
+ // If the container has started, kill and wait for all processes.
+ if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
+ return fmt.Errorf("sending SIGKILL to all container processes: %v", err)
+ }
+ }
+
+ // Remove all container thread groups from the map.
+ for key := range l.processes {
+ if key.cid == cid {
+ delete(l.processes, key)
+ }
+ }
+
+ ctx := l.rootProcArgs.NewContext(l.k)
+ if err := destroyContainerFS(ctx, cid, l.k); err != nil {
+ return fmt.Errorf("destroying filesystem for container %q: %v", cid, err)
+ }
+
+ // We made it!
+ log.Debugf("Container destroyed %q", cid)
+ return nil
+}
+
+func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
+ // Hold the lock for the entire operation to ensure that exec'd process is
+ // added to 'processes' in case it races with destroyContainer().
+ l.mu.Lock()
+ defer l.mu.Unlock()
+
+ tg, _, err := l.threadGroupFromIDLocked(execID{cid: args.ContainerID})
+ if err != nil {
+ return 0, fmt.Errorf("no such container: %q", args.ContainerID)
+ }
+
+ // Get the container Root Dirent from the Task, since we must run this
+ // process with the same Root.
+ tg.Leader().WithMuLocked(func(t *kernel.Task) {
+ args.Root = t.FSContext().RootDirectory()
+ })
+ if args.Root != nil {
+ defer args.Root.DecRef()
+ }
+
+ // Start the process.
+ proc := control.Proc{Kernel: l.k}
+ newTG, tgid, ttyFile, err := control.ExecAsync(&proc, args)
+ if err != nil {
+ return 0, err
+ }
+
+ eid := execID{cid: args.ContainerID, pid: tgid}
+ l.processes[eid] = &execProcess{
+ tg: newTG,
+ tty: ttyFile,
+ }
+ log.Debugf("updated processes: %v", l.processes)
+
+ return tgid, nil
+}
+
+// waitContainer waits for the init process of a container to exit.
+func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
+ // Don't defer unlock, as doing so would make it impossible for
+ // multiple clients to wait on the same container.
+ tg, _, err := l.threadGroupFromID(execID{cid: cid})
+ if err != nil {
+ return fmt.Errorf("can't wait for container %q: %v", cid, err)
+ }
+
+ // If the thread either has already exited or exits during waiting,
+ // consider the container exited.
+ ws := l.wait(tg)
+ *waitStatus = ws
+ return nil
+}
+
+func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, waitStatus *uint32) error {
+ if tgid <= 0 {
+ return fmt.Errorf("PID (%d) must be positive", tgid)
+ }
+
+ // Try to find a process that was exec'd
+ eid := execID{cid: cid, pid: tgid}
+ execTG, _, err := l.threadGroupFromID(eid)
+ if err == nil {
+ ws := l.wait(execTG)
+ *waitStatus = ws
+
+ // Remove tg from the cache if caller requested it.
+ if clearStatus {
+ l.mu.Lock()
+ delete(l.processes, eid)
+ log.Debugf("updated processes (removal): %v", l.processes)
+ l.mu.Unlock()
+ }
+ return nil
+ }
+
+ // The caller may be waiting on a process not started directly via exec.
+ // In this case, find the process in the container's PID namespace.
+ initTG, _, err := l.threadGroupFromID(execID{cid: cid})
+ if err != nil {
+ return fmt.Errorf("waiting for PID %d: %v", tgid, err)
+ }
+ tg := initTG.PIDNamespace().ThreadGroupWithID(tgid)
+ if tg == nil {
+ return fmt.Errorf("waiting for PID %d: no such process", tgid)
+ }
+ if tg.Leader().ContainerID() != cid {
+ return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID())
+ }
+ ws := l.wait(tg)
+ *waitStatus = ws
+ return nil
+}
+
+// wait waits for the process with TGID 'tgid' in a container's PID namespace
+// to exit.
+func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 {
+ tg.WaitExited()
+ return tg.ExitStatus().Status()
+}
+
+// WaitForStartSignal waits for a start signal from the control server.
+func (l *Loader) WaitForStartSignal() {
+ <-l.ctrl.manager.startChan
+}
+
+// WaitExit waits for the root container to exit, and returns its exit status.
+func (l *Loader) WaitExit() kernel.ExitStatus {
+ // Wait for container.
+ l.k.WaitExited()
+
+ return l.k.GlobalInit().ExitStatus()
+}
+
+func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
+ switch conf.Network {
+ case NetworkHost:
+ return hostinet.NewStack(), nil
+
+ case NetworkNone, NetworkSandbox:
+ // NetworkNone sets up loopback using netstack.
+ netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName}
+ protoNames := []string{tcp.ProtocolName, udp.ProtocolName, icmp.ProtocolName4}
+ s := epsocket.Stack{stack.New(netProtos, protoNames, stack.Options{
+ Clock: clock,
+ Stats: epsocket.Metrics,
+ HandleLocal: true,
+ // Enable raw sockets for users with sufficient
+ // privileges.
+ Raw: true,
+ })}
+ if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
+ return nil, fmt.Errorf("failed to enable SACK: %v", err)
+ }
+ return &s, nil
+
+ default:
+ panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
+ }
+}
+
+// signal sends a signal to one or more processes in a container. If PID is 0,
+// then the container init process is used. Depending on the SignalDeliveryMode
+// option, the signal may be sent directly to the indicated process, to all
+// processes in the container, or to the foreground process group.
+func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error {
+ if pid < 0 {
+ return fmt.Errorf("PID (%d) must be positive", pid)
+ }
+
+ switch mode {
+ case DeliverToProcess:
+ if err := l.signalProcess(cid, kernel.ThreadID(pid), signo); err != nil {
+ return fmt.Errorf("signaling process in container %q PID %d: %v", cid, pid, err)
+ }
+ return nil
+
+ case DeliverToForegroundProcessGroup:
+ if err := l.signalForegrondProcessGroup(cid, kernel.ThreadID(pid), signo); err != nil {
+ return fmt.Errorf("signaling foreground process group in container %q PID %d: %v", cid, pid, err)
+ }
+ return nil
+
+ case DeliverToAllProcesses:
+ if pid != 0 {
+ return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid)
+ }
+ // Check that the container has actually started before signaling it.
+ _, _, err := l.threadGroupFromID(execID{cid: cid})
+ if err != nil {
+ return err
+ }
+ if err := l.signalAllProcesses(cid, signo); err != nil {
+ return fmt.Errorf("signaling all processes in container %q: %v", cid, err)
+ }
+ return nil
+
+ default:
+ panic(fmt.Sprintf("unknown signal delivery mode %v", mode))
+ }
+}
+
+func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error {
+ execTG, _, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
+ if err == nil {
+ // Send signal directly to the identified process.
+ return execTG.SendSignal(&arch.SignalInfo{Signo: signo})
+ }
+
+ // The caller may be signaling a process not started directly via exec.
+ // In this case, find the process in the container's PID namespace and
+ // signal it.
+ initTG, _, err := l.threadGroupFromID(execID{cid: cid})
+ if err != nil {
+ return fmt.Errorf("no thread group found: %v", err)
+ }
+ tg := initTG.PIDNamespace().ThreadGroupWithID(tgid)
+ if tg == nil {
+ return fmt.Errorf("no such process with PID %d", tgid)
+ }
+ if tg.Leader().ContainerID() != cid {
+ return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID())
+ }
+ return tg.SendSignal(&arch.SignalInfo{Signo: signo})
+}
+
+func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error {
+ // Lookup foreground process group from the TTY for the given process,
+ // and send the signal to it.
+ tg, tty, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
+ if err != nil {
+ return fmt.Errorf("no thread group found: %v", err)
+ }
+ if tty == nil {
+ return fmt.Errorf("no TTY attached")
+ }
+ pg := tty.ForegroundProcessGroup()
+ if pg == nil {
+ // No foreground process group has been set. Signal the
+ // original thread group.
+ log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid)
+ return tg.SendSignal(&arch.SignalInfo{Signo: signo})
+ }
+ // Send the signal to all processes in the process group.
+ var lastErr error
+ for _, tg := range l.k.TaskSet().Root.ThreadGroups() {
+ if tg.ProcessGroup() != pg {
+ continue
+ }
+ if err := tg.SendSignal(&arch.SignalInfo{Signo: signo}); err != nil {
+ lastErr = err
+ }
+ }
+ return lastErr
+}
+
+// signalAllProcesses that belong to specified container. It's a noop if the
+// container hasn't started or has exited.
+func (l *Loader) signalAllProcesses(cid string, signo int32) error {
+ // Pause the kernel to prevent new processes from being created while
+ // the signal is delivered. This prevents process leaks when SIGKILL is
+ // sent to the entire container.
+ l.k.Pause()
+ if err := l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo}); err != nil {
+ l.k.Unpause()
+ return err
+ }
+ l.k.Unpause()
+
+ // If SIGKILLing all processes, wait for them to exit.
+ if linux.Signal(signo) == linux.SIGKILL {
+ for _, t := range l.k.TaskSet().Root.Tasks() {
+ if t.ContainerID() == cid {
+ t.ThreadGroup().WaitExited()
+ }
+ }
+ }
+ return nil
+}
+
+// threadGroupFromID same as threadGroupFromIDLocked except that it acquires
+// mutex before calling it.
+func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, error) {
+ l.mu.Lock()
+ defer l.mu.Unlock()
+ return l.threadGroupFromIDLocked(key)
+}
+
+// threadGroupFromIDLocked returns the thread group and TTY for the given
+// execution ID. TTY may be nil if the process is not attached to a terminal.
+// Returns error if execution ID is invalid or if container/process has not
+// started yet. Caller must hold 'mu'.
+func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, error) {
+ ep := l.processes[key]
+ if ep == nil {
+ return nil, nil, fmt.Errorf("container not found")
+ }
+ if ep.tg == nil {
+ return nil, nil, fmt.Errorf("container not started")
+ }
+ return ep.tg, ep.tty, nil
+}
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
new file mode 100644
index 000000000..0a154d90b
--- /dev/null
+++ b/runsc/boot/network.go
@@ -0,0 +1,222 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+ "net"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/link/loopback"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
+ "gvisor.googlesource.com/gvisor/pkg/urpc"
+)
+
+// Network exposes methods that can be used to configure a network stack.
+type Network struct {
+ Stack *stack.Stack
+}
+
+// Route represents a route in the network stack.
+type Route struct {
+ Destination net.IP
+ Mask net.IPMask
+ Gateway net.IP
+}
+
+// DefaultRoute represents a catch all route to the default gateway.
+type DefaultRoute struct {
+ Route Route
+ Name string
+}
+
+// FDBasedLink configures an fd-based link.
+type FDBasedLink struct {
+ Name string
+ MTU int
+ Addresses []net.IP
+ Routes []Route
+ GSOMaxSize uint32
+ LinkAddress []byte
+}
+
+// LoopbackLink configures a loopback li nk.
+type LoopbackLink struct {
+ Name string
+ Addresses []net.IP
+ Routes []Route
+}
+
+// CreateLinksAndRoutesArgs are arguments to CreateLinkAndRoutes.
+type CreateLinksAndRoutesArgs struct {
+ // FilePayload contains the fds associated with the FDBasedLinks. The
+ // two slices must have the same length.
+ urpc.FilePayload
+
+ LoopbackLinks []LoopbackLink
+ FDBasedLinks []FDBasedLink
+
+ DefaultGateway DefaultRoute
+}
+
+// Empty returns true if route hasn't been set.
+func (r *Route) Empty() bool {
+ return r.Destination == nil && r.Mask == nil && r.Gateway == nil
+}
+
+func (r *Route) toTcpipRoute(id tcpip.NICID) tcpip.Route {
+ return tcpip.Route{
+ Destination: ipToAddress(r.Destination),
+ Gateway: ipToAddress(r.Gateway),
+ Mask: ipToAddressMask(net.IP(r.Mask)),
+ NIC: id,
+ }
+}
+
+// CreateLinksAndRoutes creates links and routes in a network stack. It should
+// only be called once.
+func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error {
+ if len(args.FilePayload.Files) != len(args.FDBasedLinks) {
+ return fmt.Errorf("FilePayload must be same length at FDBasedLinks")
+ }
+
+ var nicID tcpip.NICID
+ nicids := make(map[string]tcpip.NICID)
+
+ // Collect routes from all links.
+ var routes []tcpip.Route
+
+ // Loopback normally appear before other interfaces.
+ for _, link := range args.LoopbackLinks {
+ nicID++
+ nicids[link.Name] = nicID
+
+ linkEP := loopback.New()
+
+ log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses)
+ if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, true /* loopback */); err != nil {
+ return err
+ }
+
+ // Collect the routes from this link.
+ for _, r := range link.Routes {
+ routes = append(routes, r.toTcpipRoute(nicID))
+ }
+ }
+
+ for i, link := range args.FDBasedLinks {
+ nicID++
+ nicids[link.Name] = nicID
+
+ // Copy the underlying FD.
+ oldFD := args.FilePayload.Files[i].Fd()
+ newFD, err := syscall.Dup(int(oldFD))
+ if err != nil {
+ return fmt.Errorf("failed to dup FD %v: %v", oldFD, err)
+ }
+
+ mac := tcpip.LinkAddress(link.LinkAddress)
+ linkEP, err := fdbased.New(&fdbased.Options{
+ FD: newFD,
+ MTU: uint32(link.MTU),
+ EthernetHeader: true,
+ Address: mac,
+ PacketDispatchMode: fdbased.RecvMMsg,
+ GSOMaxSize: link.GSOMaxSize,
+ RXChecksumOffload: true,
+ })
+ if err != nil {
+ return err
+ }
+
+ log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac)
+ if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, false /* loopback */); err != nil {
+ return err
+ }
+
+ // Collect the routes from this link.
+ for _, r := range link.Routes {
+ routes = append(routes, r.toTcpipRoute(nicID))
+ }
+ }
+
+ if !args.DefaultGateway.Route.Empty() {
+ nicID, ok := nicids[args.DefaultGateway.Name]
+ if !ok {
+ return fmt.Errorf("invalid interface name %q for default route", args.DefaultGateway.Name)
+ }
+ routes = append(routes, args.DefaultGateway.Route.toTcpipRoute(nicID))
+ }
+
+ log.Infof("Setting routes %+v", routes)
+ n.Stack.SetRouteTable(routes)
+ return nil
+}
+
+// createNICWithAddrs creates a NIC in the network stack and adds the given
+// addresses.
+func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, addrs []net.IP, loopback bool) error {
+ if loopback {
+ if err := n.Stack.CreateNamedLoopbackNIC(id, name, sniffer.New(linkEP)); err != nil {
+ return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v, %v) failed: %v", id, name, linkEP, err)
+ }
+ } else {
+ if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(linkEP)); err != nil {
+ return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, linkEP, err)
+ }
+ }
+
+ // Always start with an arp address for the NIC.
+ if err := n.Stack.AddAddress(id, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+ return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, arp.ProtocolNumber, arp.ProtocolAddress, err)
+ }
+
+ for _, addr := range addrs {
+ proto, tcpipAddr := ipToAddressAndProto(addr)
+ if err := n.Stack.AddAddress(id, proto, tcpipAddr); err != nil {
+ return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, proto, tcpipAddr, err)
+ }
+ }
+ return nil
+}
+
+// ipToAddressAndProto converts IP to tcpip.Address and a protocol number.
+//
+// Note: don't use 'len(ip)' to determine IP version because length is always 16.
+func ipToAddressAndProto(ip net.IP) (tcpip.NetworkProtocolNumber, tcpip.Address) {
+ if i4 := ip.To4(); i4 != nil {
+ return ipv4.ProtocolNumber, tcpip.Address(i4)
+ }
+ return ipv6.ProtocolNumber, tcpip.Address(ip)
+}
+
+// ipToAddress converts IP to tcpip.Address, ignoring the protocol.
+func ipToAddress(ip net.IP) tcpip.Address {
+ _, addr := ipToAddressAndProto(ip)
+ return addr
+}
+
+// ipToAddressMask converts IP to tcpip.AddressMask, ignoring the protocol.
+func ipToAddressMask(ip net.IP) tcpip.AddressMask {
+ _, addr := ipToAddressAndProto(ip)
+ return tcpip.AddressMask(addr)
+}
diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go
new file mode 100644
index 000000000..19c7f8fbd
--- /dev/null
+++ b/runsc/boot/strace.go
@@ -0,0 +1,40 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/strace"
+)
+
+func enableStrace(conf *Config) error {
+ // We must initialize even if strace is not enabled.
+ strace.Initialize()
+
+ if !conf.Strace {
+ return nil
+ }
+
+ max := conf.StraceLogSize
+ if max == 0 {
+ max = 1024
+ }
+ strace.LogMaximumSize = max
+
+ if len(conf.StraceSyscalls) == 0 {
+ strace.EnableAll(strace.SinkTypeLog)
+ return nil
+ }
+ return strace.Enable(conf.StraceSyscalls, strace.SinkTypeLog)
+}