summaryrefslogtreecommitdiffhomepage
path: root/runsc
diff options
context:
space:
mode:
Diffstat (limited to 'runsc')
-rw-r--r--runsc/BUILD17
-rw-r--r--runsc/boot/BUILD88
-rw-r--r--runsc/boot/capability.go120
-rw-r--r--runsc/boot/config.go162
-rw-r--r--runsc/boot/controller.go128
-rw-r--r--runsc/boot/events.go81
-rw-r--r--runsc/boot/fds.go61
-rw-r--r--runsc/boot/filter/BUILD26
-rw-r--r--runsc/boot/filter/config.go175
-rw-r--r--runsc/boot/filter/extra_filters.go24
-rw-r--r--runsc/boot/filter/extra_filters_msan.go30
-rw-r--r--runsc/boot/filter/extra_filters_race.go33
-rw-r--r--runsc/boot/filter/filter.go67
-rw-r--r--runsc/boot/fs.go441
-rw-r--r--runsc/boot/limits.go60
-rw-r--r--runsc/boot/loader.go354
-rw-r--r--runsc/boot/loader_test.go238
-rw-r--r--runsc/boot/network.go213
-rw-r--r--runsc/boot/strace.go40
-rw-r--r--runsc/cmd/BUILD58
-rw-r--r--runsc/cmd/boot.go161
-rw-r--r--runsc/cmd/cmd.go77
-rw-r--r--runsc/cmd/create.go93
-rw-r--r--runsc/cmd/delete.go74
-rw-r--r--runsc/cmd/events.go111
-rw-r--r--runsc/cmd/exec.go375
-rw-r--r--runsc/cmd/exec_test.go154
-rw-r--r--runsc/cmd/gofer.go134
-rw-r--r--runsc/cmd/kill.go142
-rw-r--r--runsc/cmd/list.go117
-rw-r--r--runsc/cmd/path.go38
-rw-r--r--runsc/cmd/ps.go86
-rw-r--r--runsc/cmd/run.go82
-rw-r--r--runsc/cmd/start.go64
-rw-r--r--runsc/cmd/state.go73
-rw-r--r--runsc/fsgofer/BUILD33
-rw-r--r--runsc/fsgofer/fsgofer.go937
-rw-r--r--runsc/fsgofer/fsgofer_test.go576
-rw-r--r--runsc/fsgofer/fsgofer_unsafe.go58
-rw-r--r--runsc/main.go199
-rw-r--r--runsc/sandbox/BUILD53
-rw-r--r--runsc/sandbox/console.go60
-rw-r--r--runsc/sandbox/hook.go111
-rw-r--r--runsc/sandbox/namespace.go204
-rw-r--r--runsc/sandbox/network.go348
-rw-r--r--runsc/sandbox/sandbox.go666
-rw-r--r--runsc/sandbox/sandbox_test.go649
-rw-r--r--runsc/sandbox/status.go56
-rw-r--r--runsc/specutils/BUILD18
-rw-r--r--runsc/specutils/specutils.go183
50 files changed, 8348 insertions, 0 deletions
diff --git a/runsc/BUILD b/runsc/BUILD
new file mode 100644
index 000000000..3651c2d30
--- /dev/null
+++ b/runsc/BUILD
@@ -0,0 +1,17 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+
+go_binary(
+ name = "runsc",
+ srcs = [
+ "main.go",
+ ],
+ pure = "on",
+ deps = [
+ "//pkg/log",
+ "//runsc/boot",
+ "//runsc/cmd",
+ "@com_github_google_subcommands//:go_default_library",
+ ],
+)
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
new file mode 100644
index 000000000..88736cfa4
--- /dev/null
+++ b/runsc/boot/BUILD
@@ -0,0 +1,88 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+go_library(
+ name = "boot",
+ srcs = [
+ "capability.go",
+ "config.go",
+ "controller.go",
+ "events.go",
+ "fds.go",
+ "fs.go",
+ "limits.go",
+ "loader.go",
+ "network.go",
+ "strace.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/runsc/boot",
+ visibility = [
+ "//runsc:__subpackages__",
+ ],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/control/server",
+ "//pkg/cpuid",
+ "//pkg/log",
+ "//pkg/sentry/context",
+ "//pkg/sentry/control",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/fs/dev",
+ "//pkg/sentry/fs/gofer",
+ "//pkg/sentry/fs/host",
+ "//pkg/sentry/fs/proc",
+ "//pkg/sentry/fs/ramfs",
+ "//pkg/sentry/fs/sys",
+ "//pkg/sentry/fs/tmpfs",
+ "//pkg/sentry/inet",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/kdefs",
+ "//pkg/sentry/limits",
+ "//pkg/sentry/loader",
+ "//pkg/sentry/platform",
+ "//pkg/sentry/platform/kvm",
+ "//pkg/sentry/platform/ptrace",
+ "//pkg/sentry/sighandling",
+ "//pkg/sentry/socket/epsocket",
+ "//pkg/sentry/socket/hostinet",
+ "//pkg/sentry/socket/netlink",
+ "//pkg/sentry/socket/netlink/route",
+ "//pkg/sentry/socket/unix",
+ "//pkg/sentry/strace",
+ "//pkg/sentry/syscalls/linux",
+ "//pkg/sentry/time",
+ "//pkg/sentry/usage",
+ "//pkg/sentry/watchdog",
+ "//pkg/syserror",
+ "//pkg/tcpip",
+ "//pkg/tcpip/link/fdbased",
+ "//pkg/tcpip/link/loopback",
+ "//pkg/tcpip/link/sniffer",
+ "//pkg/tcpip/network/arp",
+ "//pkg/tcpip/network/ipv4",
+ "//pkg/tcpip/network/ipv6",
+ "//pkg/tcpip/stack",
+ "//pkg/tcpip/transport/tcp",
+ "//pkg/tcpip/transport/udp",
+ "//pkg/urpc",
+ "//runsc/boot/filter",
+ "//runsc/specutils",
+ "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_syndtr_gocapability//capability:go_default_library",
+ ],
+)
+
+go_test(
+ name = "boot_test",
+ size = "small",
+ srcs = ["loader_test.go"],
+ embed = [":boot"],
+ deps = [
+ "//pkg/control/server",
+ "//pkg/log",
+ "//pkg/sentry/context/contexttest",
+ "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ ],
+)
diff --git a/runsc/boot/capability.go b/runsc/boot/capability.go
new file mode 100644
index 000000000..4c6a59245
--- /dev/null
+++ b/runsc/boot/capability.go
@@ -0,0 +1,120 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+ "os"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "github.com/syndtr/gocapability/capability"
+)
+
+// ApplyCaps applies the capabilities in the spec to the current thread.
+//
+// Note that it must be called with current thread locked.
+func ApplyCaps(conf *Config, caps *specs.LinuxCapabilities) error {
+ setter, err := capability.NewPid2(os.Getpid())
+ if err != nil {
+ return err
+ }
+
+ bounding, err := capsFromNames(caps.Bounding)
+ if err != nil {
+ return err
+ }
+ effective, err := capsFromNames(caps.Effective)
+ if err != nil {
+ return err
+ }
+ permitted, err := capsFromNames(caps.Permitted)
+ if err != nil {
+ return err
+ }
+ inheritable, err := capsFromNames(caps.Inheritable)
+ if err != nil {
+ return err
+ }
+ ambient, err := capsFromNames(caps.Ambient)
+ if err != nil {
+ return err
+ }
+
+ // Ptrace platform requires extra capabilities.
+ if conf.Platform == PlatformPtrace {
+ bounding = append(bounding, capability.CAP_SYS_PTRACE)
+ effective = append(effective, capability.CAP_SYS_PTRACE)
+ permitted = append(permitted, capability.CAP_SYS_PTRACE)
+ }
+
+ setter.Set(capability.BOUNDS, bounding...)
+ setter.Set(capability.PERMITTED, permitted...)
+ setter.Set(capability.INHERITABLE, inheritable...)
+ setter.Set(capability.EFFECTIVE, effective...)
+ setter.Set(capability.AMBIENT, ambient...)
+ return setter.Apply(capability.CAPS | capability.BOUNDS | capability.AMBS)
+}
+
+// capsFromNames looks up each name in capFromName and returns the matching
+// capabilities in the same order. The first name without an entry produces an
+// error and a nil slice.
+func capsFromNames(names []string) ([]capability.Cap, error) {
+	var out []capability.Cap
+	for i := range names {
+		c, known := capFromName[names[i]]
+		if !known {
+			return nil, fmt.Errorf("invalid capability %q", names[i])
+		}
+		out = append(out, c)
+	}
+	return out, nil
+}
+
+// capFromName maps the capability names accepted in the runtime spec to the
+// corresponding capability.Cap values. Names absent from this table are
+// rejected by capsFromNames.
+var capFromName = map[string]capability.Cap{
+	"CAP_CHOWN":            capability.CAP_CHOWN,
+	"CAP_DAC_OVERRIDE":     capability.CAP_DAC_OVERRIDE,
+	"CAP_DAC_READ_SEARCH":  capability.CAP_DAC_READ_SEARCH,
+	"CAP_FOWNER":           capability.CAP_FOWNER,
+	"CAP_FSETID":           capability.CAP_FSETID,
+	"CAP_KILL":             capability.CAP_KILL,
+	"CAP_SETGID":           capability.CAP_SETGID,
+	"CAP_SETUID":           capability.CAP_SETUID,
+	"CAP_SETPCAP":          capability.CAP_SETPCAP,
+	"CAP_LINUX_IMMUTABLE":  capability.CAP_LINUX_IMMUTABLE,
+	"CAP_NET_BIND_SERVICE": capability.CAP_NET_BIND_SERVICE,
+	// The kernel name has no underscore between BROAD and CAST.
+	"CAP_NET_BROADCAST":    capability.CAP_NET_BROADCAST,
+	"CAP_NET_ADMIN":        capability.CAP_NET_ADMIN,
+	"CAP_NET_RAW":          capability.CAP_NET_RAW,
+	"CAP_IPC_LOCK":         capability.CAP_IPC_LOCK,
+	"CAP_IPC_OWNER":        capability.CAP_IPC_OWNER,
+	"CAP_SYS_MODULE":       capability.CAP_SYS_MODULE,
+	"CAP_SYS_RAWIO":        capability.CAP_SYS_RAWIO,
+	"CAP_SYS_CHROOT":       capability.CAP_SYS_CHROOT,
+	"CAP_SYS_PTRACE":       capability.CAP_SYS_PTRACE,
+	"CAP_SYS_PACCT":        capability.CAP_SYS_PACCT,
+	"CAP_SYS_ADMIN":        capability.CAP_SYS_ADMIN,
+	"CAP_SYS_BOOT":         capability.CAP_SYS_BOOT,
+	"CAP_SYS_NICE":         capability.CAP_SYS_NICE,
+	"CAP_SYS_RESOURCE":     capability.CAP_SYS_RESOURCE,
+	"CAP_SYS_TIME":         capability.CAP_SYS_TIME,
+	"CAP_SYS_TTY_CONFIG":   capability.CAP_SYS_TTY_CONFIG,
+	"CAP_MKNOD":            capability.CAP_MKNOD,
+	"CAP_LEASE":            capability.CAP_LEASE,
+	"CAP_AUDIT_WRITE":      capability.CAP_AUDIT_WRITE,
+	"CAP_AUDIT_CONTROL":    capability.CAP_AUDIT_CONTROL,
+	"CAP_SETFCAP":          capability.CAP_SETFCAP,
+	"CAP_MAC_OVERRIDE":     capability.CAP_MAC_OVERRIDE,
+	"CAP_MAC_ADMIN":        capability.CAP_MAC_ADMIN,
+	"CAP_SYSLOG":           capability.CAP_SYSLOG,
+	"CAP_WAKE_ALARM":       capability.CAP_WAKE_ALARM,
+	"CAP_BLOCK_SUSPEND":    capability.CAP_BLOCK_SUSPEND,
+}
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
new file mode 100644
index 000000000..f3e33e89a
--- /dev/null
+++ b/runsc/boot/config.go
@@ -0,0 +1,162 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import "fmt"
+
+// PlatformType tells which platform the sandbox runs on.
+type PlatformType int
+
+const (
+ // PlatformPtrace, the zero value, runs the sandbox with the ptrace platform.
+ PlatformPtrace PlatformType = iota
+
+ // PlatformKVM runs the sandbox with the KVM platform.
+ PlatformKVM
+)
+
+// MakePlatformType parses a platform name ("ptrace" or "kvm") into its
+// PlatformType. Any other string yields an error.
+func MakePlatformType(s string) (PlatformType, error) {
+	if s == "ptrace" {
+		return PlatformPtrace, nil
+	}
+	if s == "kvm" {
+		return PlatformKVM, nil
+	}
+	return 0, fmt.Errorf("invalid platform type %q", s)
+}
+
+// String returns the name that MakePlatformType accepts for p, or
+// "unknown(N)" when p is not one of the declared constants.
+func (p PlatformType) String() string {
+	if p == PlatformPtrace {
+		return "ptrace"
+	}
+	if p == PlatformKVM {
+		return "kvm"
+	}
+	return fmt.Sprintf("unknown(%d)", p)
+}
+
+// FileAccessType tells how the sandbox accesses the filesystem.
+type FileAccessType int
+
+const (
+ // FileAccessProxy, the zero value, sends IO requests to a Gofer process that
+ // validates the requests and forwards them to the host.
+ FileAccessProxy FileAccessType = iota
+
+ // FileAccessDirect connects the sandbox directly to the host filesystem.
+ FileAccessDirect
+)
+
+// MakeFileAccessType parses a file access name ("proxy" or "direct") into its
+// FileAccessType. Any other string yields an error.
+func MakeFileAccessType(s string) (FileAccessType, error) {
+	if s == "proxy" {
+		return FileAccessProxy, nil
+	}
+	if s == "direct" {
+		return FileAccessDirect, nil
+	}
+	return 0, fmt.Errorf("invalid file access type %q", s)
+}
+
+// String returns the name that MakeFileAccessType accepts for f, or
+// "unknown(N)" when f is not one of the declared constants.
+func (f FileAccessType) String() string {
+	switch {
+	case f == FileAccessProxy:
+		return "proxy"
+	case f == FileAccessDirect:
+		return "direct"
+	}
+	return fmt.Sprintf("unknown(%d)", f)
+}
+
+// NetworkType tells which network stack the sandbox uses.
+type NetworkType int
+
+const (
+ // NetworkSandbox, the zero value, uses the internal network stack, isolated from the host.
+ NetworkSandbox NetworkType = iota
+
+ // NetworkHost redirects network related syscalls to the host network.
+ NetworkHost
+
+ // NetworkNone sets up just loopback using netstack.
+ NetworkNone
+)
+
+// MakeNetworkType parses a network name ("sandbox", "host" or "none") into
+// its NetworkType. Any other string yields an error.
+func MakeNetworkType(s string) (NetworkType, error) {
+	if s == "sandbox" {
+		return NetworkSandbox, nil
+	}
+	if s == "host" {
+		return NetworkHost, nil
+	}
+	if s == "none" {
+		return NetworkNone, nil
+	}
+	return 0, fmt.Errorf("invalid network type %q", s)
+}
+
+// String returns the name that MakeNetworkType accepts for n, or
+// "unknown(N)" when n is not one of the declared constants.
+func (n NetworkType) String() string {
+	switch {
+	case n == NetworkSandbox:
+		return "sandbox"
+	case n == NetworkHost:
+		return "host"
+	case n == NetworkNone:
+		return "none"
+	}
+	return fmt.Sprintf("unknown(%d)", n)
+}
+
+// Config holds configuration that is not part of the runtime spec.
+type Config struct {
+ // RootDir is the runtime root directory.
+ RootDir string
+
+ // FileAccess indicates how the filesystem is accessed.
+ FileAccess FileAccessType
+
+ // Overlay is whether to wrap the root filesystem in an overlay.
+ Overlay bool
+
+ // Network indicates what type of network to use.
+ Network NetworkType
+
+ // LogPackets indicates that all network packets should be logged.
+ LogPackets bool
+
+ // Platform is the platform to run on.
+ Platform PlatformType
+
+ // Strace indicates that strace should be enabled.
+ Strace bool
+
+ // StraceSyscalls is the set of syscalls to trace. If Strace is true and
+ // this list is empty, then all syscalls will be traced.
+ StraceSyscalls []string
+
+ // StraceLogSize is the max size of data blobs to display.
+ StraceLogSize uint
+
+ // DisableSeccomp indicates whether seccomp syscall filters should be
+ // disabled. Pardon the double negation, but default to enabled is important.
+ DisableSeccomp bool
+}
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
new file mode 100644
index 000000000..4d4ef7256
--- /dev/null
+++ b/runsc/boot/controller.go
@@ -0,0 +1,128 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+
+ "gvisor.googlesource.com/gvisor/pkg/control/server"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/control"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket"
+)
+
+const (
+ // ApplicationStart is the URPC endpoint for starting the sandboxed application.
+ ApplicationStart = "application.Start"
+
+ // ApplicationProcesses is the URPC endpoint for getting the list of
+ // processes running in a sandbox.
+ ApplicationProcesses = "application.Processes"
+
+ // ApplicationExecute is the URPC endpoint for executing a command in a
+ // sandbox.
+ ApplicationExecute = "application.Execute"
+
+ // ApplicationEvent is the URPC endpoint for getting stats about the
+ // container; it is used by "runsc events".
+ ApplicationEvent = "application.Event"
+
+ // NetworkCreateLinksAndRoutes is the URPC endpoint for creating links
+ // and routes in a network stack.
+ NetworkCreateLinksAndRoutes = "Network.CreateLinksAndRoutes"
+)
+
+// ControlSocketAddr returns the abstract unix socket name (leading NUL byte)
+// used by the control server of the sandbox with the given id.
+func ControlSocketAddr(id string) string {
+	addr := fmt.Sprintf("\x00runsc-sandbox.%s", id)
+	return addr
+}
+
+// controller holds the control server, and is used for communication into the
+// sandbox.
+type controller struct {
+ // srv is the control server.
+ srv *server.Server
+
+ // app holds the application methods.
+ app *application
+}
+
+// newController builds a control server on top of fd, registers the
+// application methods (and the Network methods when the kernel's network
+// stack is an epsocket stack), and starts serving.
+func newController(fd int, k *kernel.Kernel) (*controller, error) {
+	ctrlSrv, err := server.CreateFromFD(fd)
+	if err != nil {
+		return nil, err
+	}
+
+	ctrl := &controller{
+		srv: ctrlSrv,
+		app: &application{
+			startChan:       make(chan struct{}),
+			startResultChan: make(chan error, 1),
+			k:               k,
+		},
+	}
+	ctrlSrv.Register(ctrl.app)
+
+	// Network methods are only exposed for the epsocket-backed stack.
+	if eps, ok := k.NetworkStack().(*epsocket.Stack); ok {
+		ctrlSrv.Register(&Network{Stack: eps.Stack})
+	}
+
+	if err := ctrlSrv.StartServing(); err != nil {
+		return nil, err
+	}
+	return ctrl, nil
+}
+
+// application contains methods that control the sandboxed application.
+type application struct {
+ // startChan is used to signal when the application process should be
+ // started.
+ startChan chan struct{}
+
+ // startResultChan reports the outcome of starting the application: any
+ // error encountered during startup is sent on it, and a nil value indicates
+ // success. newController creates it with a buffer of one.
+ startResultChan chan error
+
+ // k is the emulated linux kernel on which the sandboxed
+ // application runs.
+ k *kernel.Kernel
+}
+
+// Start requests that the application process be started and blocks until the
+// start result is available, returning it to the caller.
+func (a *application) Start(_, _ *struct{}) error {
+	// Signal the start request first, then wait for its outcome.
+	a.startChan <- struct{}{}
+	startErr := <-a.startResultChan
+	return startErr
+}
+
+// Processes fills out with information about the processes running in the
+// sandbox by delegating to control.Processes.
+func (a *application) Processes(_, out *[]*control.Process) error {
+	err := control.Processes(a.k, out)
+	return err
+}
+
+// Execute runs the command described by e in a created or running sandbox and
+// stores its wait status in waitStatus. Failures are wrapped with the
+// arguments that were being executed.
+func (a *application) Execute(e *control.ExecArgs, waitStatus *uint32) error {
+	proc := control.Proc{Kernel: a.k}
+	err := proc.Exec(e, waitStatus)
+	if err == nil {
+		return nil
+	}
+	return fmt.Errorf("error executing: %+v: %v", e, err)
+}
diff --git a/runsc/boot/events.go b/runsc/boot/events.go
new file mode 100644
index 000000000..ef6459b01
--- /dev/null
+++ b/runsc/boot/events.go
@@ -0,0 +1,81 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+)
+
+// Event is the struct used to encode event data as JSON. It corresponds to
+// runc's main.event struct.
+type Event struct {
+ Type string `json:"type"`
+ ID string `json:"id"`
+ Data interface{} `json:"data,omitempty"`
+}
+
+// Stats is the runc-specific stats structure, kept for stability when encoding
+// and decoding stats.
+// TODO: Many fields aren't obtainable due to a lack of cgroups.
+type Stats struct {
+ Memory Memory `json:"memory"`
+ Pids Pids `json:"pids"`
+}
+
+// Pids contains stats on processes. populatePIDs fills in only Current.
+type Pids struct {
+ Current uint64 `json:"current,omitempty"`
+ Limit uint64 `json:"limit,omitempty"`
+}
+
+// MemoryEntry contains limit, usage, max and failure-count stats for one kind of memory.
+type MemoryEntry struct {
+ Limit uint64 `json:"limit"`
+ Usage uint64 `json:"usage,omitempty"`
+ Max uint64 `json:"max,omitempty"`
+ Failcnt uint64 `json:"failcnt"`
+}
+
+// Memory contains stats on memory. populateMemory fills in only Usage.Usage.
+type Memory struct {
+ Cache uint64 `json:"cache,omitempty"`
+ Usage MemoryEntry `json:"usage,omitempty"`
+ Swap MemoryEntry `json:"swap,omitempty"`
+ Kernel MemoryEntry `json:"kernel,omitempty"`
+ KernelTCP MemoryEntry `json:"kernelTCP,omitempty"`
+ Raw map[string]uint64 `json:"raw,omitempty"`
+}
+
+func (a *application) Event(_ *struct{}, out *Event) error {
+ stats := &Stats{}
+ stats.populateMemory(a.k)
+ stats.populatePIDs(a.k)
+ *out = Event{Type: "stats", Data: stats}
+ return nil
+}
+
+// populateMemory refreshes the platform's memory usage and records the total
+// from usage.MemoryAccounting in s.Memory.Usage; all other fields of the
+// entry are left at zero.
+func (s *Stats) populateMemory(k *kernel.Kernel) {
+	k.Platform.Memory().UpdateUsage()
+	_, total := usage.MemoryAccounting.Copy()
+	s.Memory.Usage = MemoryEntry{Usage: total}
+}
+
+// populatePIDs sets s.Pids.Current to the number of thread groups in the
+// kernel's root PID namespace.
+func (s *Stats) populatePIDs(k *kernel.Kernel) {
+	groups := k.TaskSet().Root.ThreadGroups()
+	s.Pids.Current = uint64(len(groups))
+}
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go
new file mode 100644
index 000000000..0449e243d
--- /dev/null
+++ b/runsc/boot/fds.go
@@ -0,0 +1,61 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+)
+
+// createFDMap returns a new FD map in which sandbox fds 0, 1 and 2 refer to
+// the host's stdin, stdout and stderr. When console is true, ioctl calls are
+// passed through to the host fd.
+//
+// TODO: We currently aren't passing any FDs in to the sandbox, so
+// there's not much else for this function to do. It will get more complicated
+// when gofers enter the picture. Also the LISTEN_FDS environment variable
+// allows passing arbitrary FDs to the sandbox, which we do not yet support.
+func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool) (*kernel.FDMap, error) {
+	fdm := k.NewFDMap()
+	// Runs on every exit path; the success path takes an extra reference
+	// just before returning so the caller still receives one.
+	defer fdm.DecRef()
+
+	owner := fs.FileOwnerFromContext(ctx)
+
+	// Sandbox fd -> host fd.
+	stdio := map[int]int{
+		0: syscall.Stdin,
+		1: syscall.Stdout,
+		2: syscall.Stderr,
+	}
+	for sandboxFD, hostFD := range stdio {
+		imported, err := host.ImportFile(ctx, hostFD, owner, console /* allow ioctls */)
+		if err != nil {
+			return nil, fmt.Errorf("failed to import fd %d: %v", hostFD, err)
+		}
+		defer imported.DecRef()
+		if err := fdm.NewFDAt(kdefs.FD(sandboxFD), imported, kernel.FDFlags{}, l); err != nil {
+			return nil, fmt.Errorf("failed to add imported fd %d to FDMap: %v", hostFD, err)
+		}
+	}
+
+	fdm.IncRef()
+	return fdm, nil
+}
diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD
new file mode 100644
index 000000000..fd1b18717
--- /dev/null
+++ b/runsc/boot/filter/BUILD
@@ -0,0 +1,26 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+ name = "filter",
+ srcs = [
+ "config.go",
+ "extra_filters.go",
+ "extra_filters_msan.go",
+ "extra_filters_race.go",
+ "filter.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/runsc/boot/filter",
+ visibility = [
+ "//runsc/boot:__subpackages__",
+ ],
+ deps = [
+ "//pkg/log",
+ "//pkg/seccomp",
+ "//pkg/sentry/platform",
+ "//pkg/sentry/platform/kvm",
+ "//pkg/sentry/platform/ptrace",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
new file mode 100644
index 000000000..130e987df
--- /dev/null
+++ b/runsc/boot/filter/config.go
@@ -0,0 +1,175 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package filter
+
+import (
+ "syscall"
+
+ "golang.org/x/sys/unix"
+)
+
+// allowedSyscalls is the base set of syscalls the Sentry may make to the
+// host OS; Install appends further sets depending on the configuration.
+var allowedSyscalls = []uintptr{
+ syscall.SYS_ACCEPT,
+ syscall.SYS_ARCH_PRCTL,
+ syscall.SYS_CLOCK_GETTIME,
+ syscall.SYS_CLONE,
+ syscall.SYS_CLOSE,
+ syscall.SYS_DUP,
+ syscall.SYS_DUP2,
+ syscall.SYS_EPOLL_CREATE1,
+ syscall.SYS_EPOLL_CTL,
+ syscall.SYS_EPOLL_PWAIT,
+ syscall.SYS_EPOLL_WAIT,
+ syscall.SYS_EVENTFD2,
+ syscall.SYS_EXIT,
+ syscall.SYS_EXIT_GROUP,
+ syscall.SYS_FALLOCATE,
+ syscall.SYS_FCHMOD,
+ syscall.SYS_FCNTL,
+ syscall.SYS_FSTAT,
+ syscall.SYS_FSYNC,
+ syscall.SYS_FTRUNCATE,
+ syscall.SYS_FUTEX,
+ syscall.SYS_GETDENTS64,
+ syscall.SYS_GETPID,
+ unix.SYS_GETRANDOM,
+ syscall.SYS_GETSOCKOPT,
+ syscall.SYS_GETTID,
+ syscall.SYS_GETTIMEOFDAY,
+ syscall.SYS_LISTEN,
+ syscall.SYS_LSEEK,
+ syscall.SYS_MADVISE,
+ syscall.SYS_MINCORE,
+ syscall.SYS_MMAP,
+ syscall.SYS_MPROTECT,
+ syscall.SYS_MUNMAP,
+ syscall.SYS_NEWFSTATAT,
+ syscall.SYS_POLL,
+ syscall.SYS_PREAD64,
+ syscall.SYS_PSELECT6,
+ syscall.SYS_PWRITE64,
+ syscall.SYS_READ,
+ syscall.SYS_READLINKAT,
+ syscall.SYS_READV,
+ syscall.SYS_RECVMSG,
+ syscall.SYS_RENAMEAT,
+ syscall.SYS_RESTART_SYSCALL,
+ syscall.SYS_RT_SIGACTION,
+ syscall.SYS_RT_SIGPROCMASK,
+ syscall.SYS_RT_SIGRETURN,
+ syscall.SYS_SCHED_YIELD,
+ syscall.SYS_SENDMSG,
+ syscall.SYS_SETITIMER,
+ syscall.SYS_SHUTDOWN,
+ syscall.SYS_SIGALTSTACK,
+ syscall.SYS_SYNC_FILE_RANGE,
+ syscall.SYS_TGKILL,
+ syscall.SYS_UTIMENSAT,
+ syscall.SYS_WRITE,
+ syscall.SYS_WRITEV,
+}
+
+// consoleFilters returns the syscalls (ioctl) needed to support tty consoles.
+// TODO: Once filters support argument-checking, we should only allow ioctl
+// with tty-related arguments.
+func consoleFilters() []uintptr {
+ return []uintptr{
+ syscall.SYS_IOCTL,
+ }
+}
+
+// whitelistFSFilters returns syscalls made by whitelistFS. Using whitelistFS
+// is less secure because it runs inside the Sentry and must be able to perform
+// file operations that would otherwise be disabled by seccomp when a Gofer is
+// used. When whitelistFS is not used, opening a new FD in the Sentry is
+// disallowed.
+func whitelistFSFilters() []uintptr {
+ return []uintptr{
+ syscall.SYS_ACCESS,
+ syscall.SYS_FCHMOD,
+ syscall.SYS_FSTAT,
+ syscall.SYS_FSYNC,
+ syscall.SYS_FTRUNCATE,
+ syscall.SYS_GETCWD,
+ syscall.SYS_GETDENTS,
+ syscall.SYS_GETDENTS64,
+ syscall.SYS_LSEEK,
+ syscall.SYS_LSTAT,
+ syscall.SYS_MKDIR,
+ syscall.SYS_MKDIRAT,
+ syscall.SYS_NEWFSTATAT,
+ syscall.SYS_OPEN,
+ syscall.SYS_OPENAT,
+ syscall.SYS_PREAD64,
+ syscall.SYS_PWRITE64,
+ syscall.SYS_READ,
+ syscall.SYS_READLINK,
+ syscall.SYS_READLINKAT,
+ syscall.SYS_RENAMEAT,
+ syscall.SYS_STAT,
+ syscall.SYS_SYMLINK,
+ syscall.SYS_SYMLINKAT,
+ syscall.SYS_SYNC_FILE_RANGE,
+ syscall.SYS_UNLINK,
+ syscall.SYS_UNLINKAT,
+ syscall.SYS_UTIMENSAT,
+ syscall.SYS_WRITE,
+ }
+}
+
+// hostInetFilters returns syscalls that are needed by sentry/socket/hostinet.
+func hostInetFilters() []uintptr {
+ return []uintptr{
+ syscall.SYS_ACCEPT4,
+ syscall.SYS_BIND,
+ syscall.SYS_CONNECT,
+ syscall.SYS_GETPEERNAME,
+ syscall.SYS_GETSOCKNAME,
+ syscall.SYS_GETSOCKOPT,
+ syscall.SYS_IOCTL,
+ syscall.SYS_LISTEN,
+ syscall.SYS_READV,
+ syscall.SYS_RECVFROM,
+ syscall.SYS_RECVMSG,
+ syscall.SYS_SENDMSG,
+ syscall.SYS_SENDTO,
+ syscall.SYS_SETSOCKOPT,
+ syscall.SYS_SHUTDOWN,
+ syscall.SYS_SOCKET,
+ syscall.SYS_WRITEV,
+ }
+}
+
+// ptraceFilters returns the syscalls made exclusively by the ptrace platform.
+func ptraceFilters() []uintptr {
+ return []uintptr{
+ syscall.SYS_PTRACE,
+ syscall.SYS_WAIT4,
+ unix.SYS_GETCPU,
+ unix.SYS_SCHED_SETAFFINITY,
+ }
+}
+
+// kvmFilters returns the syscalls made exclusively by the KVM platform.
+func kvmFilters() []uintptr {
+ return []uintptr{
+ syscall.SYS_IOCTL,
+ syscall.SYS_RT_SIGSUSPEND,
+ syscall.SYS_RT_SIGTIMEDWAIT,
+ 0xffffffffffffffff, // KVM uses syscall -1 to transition to host.
+ }
+}
diff --git a/runsc/boot/filter/extra_filters.go b/runsc/boot/filter/extra_filters.go
new file mode 100644
index 000000000..e10d9bf4c
--- /dev/null
+++ b/runsc/boot/filter/extra_filters.go
@@ -0,0 +1,24 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !msan,!race
+
+package filter
+
+// instrumentationFilters returns additional filters for syscalls used by
+// Go instrumentation tools, e.g. -race, -msan.
+// Returns empty when disabled.
+func instrumentationFilters() []uintptr {
+ return nil
+}
diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go
new file mode 100644
index 000000000..a862340f6
--- /dev/null
+++ b/runsc/boot/filter/extra_filters_msan.go
@@ -0,0 +1,30 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build msan
+
+package filter
+
+import (
+ "syscall"
+)
+
+// instrumentationFilters reports that MSAN is enabled and returns the additional syscalls it uses.
+func instrumentationFilters() []uintptr {
+ Report("MSAN is enabled: syscall filters less restrictive!")
+ return []uintptr{
+ syscall.SYS_SCHED_GETAFFINITY,
+ syscall.SYS_SET_ROBUST_LIST,
+ }
+}
diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go
new file mode 100644
index 000000000..b0c74a58a
--- /dev/null
+++ b/runsc/boot/filter/extra_filters_race.go
@@ -0,0 +1,33 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build race
+
+package filter
+
+import (
+ "syscall"
+)
+
+// instrumentationFilters reports that TSAN is enabled and returns the additional syscalls it uses.
+func instrumentationFilters() []uintptr {
+ Report("TSAN is enabled: syscall filters less restrictive!")
+ return []uintptr{
+ syscall.SYS_BRK,
+ syscall.SYS_MUNLOCK,
+ syscall.SYS_NANOSLEEP,
+ syscall.SYS_OPEN,
+ syscall.SYS_SET_ROBUST_LIST,
+ }
+}
diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go
new file mode 100644
index 000000000..3ba56a318
--- /dev/null
+++ b/runsc/boot/filter/filter.go
@@ -0,0 +1,67 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package filter defines all syscalls the sandbox is allowed to make
+// to the host, and installs seccomp filters to prevent prohibited
+// syscalls in case it's compromised.
+package filter
+
+import (
+ "fmt"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace"
+)
+
+// Install assembles the list of host syscalls the sandbox may use and installs
+// it as a seccomp filter.
+//
+// The list starts from allowedSyscalls and grows with the instrumentation
+// filters, with the filters of every feature that is switched on (whitelistFS,
+// console, hostNetwork), and finally with the filters of the platform p. A
+// warning is logged for each feature that loosens the filters. Platforms
+// other than *ptrace.PTrace and *kvm.KVM are rejected with an error.
+func Install(p platform.Platform, whitelistFS, console, hostNetwork bool) error {
+	// Extra filters needed by -race and -msan builds; empty otherwise.
+	filters := append(allowedSyscalls, instrumentationFilters()...)
+
+	if whitelistFS {
+		Report("direct file access allows unrestricted file access!")
+		filters = append(filters, whitelistFSFilters()...)
+	}
+	if console {
+		Report("console is enabled: syscall filters less restrictive!")
+		filters = append(filters, consoleFilters()...)
+	}
+	if hostNetwork {
+		Report("host networking enabled: syscall filters less restrictive!")
+		filters = append(filters, hostInetFilters()...)
+	}
+
+	// Each platform needs its own set of host syscalls.
+	switch p.(type) {
+	case *ptrace.PTrace:
+		filters = append(filters, ptraceFilters()...)
+	case *kvm.KVM:
+		filters = append(filters, kvmFilters()...)
+	default:
+		return fmt.Errorf("unknown platform type %T", p)
+	}
+
+	// TODO: Set kill=true when SECCOMP_RET_KILL_PROCESS is supported.
+	const kill = false
+	return seccomp.Install(filters, kill)
+}
+
+// Report writes a warning message to the log.
+func Report(msg string) {
+ log.Warningf("*** SECCOMP WARNING: %s", msg)
+}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
new file mode 100644
index 000000000..2073bd0b1
--- /dev/null
+++ b/runsc/boot/fs.go
@@ -0,0 +1,441 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+ "os"
+ "path/filepath"
+ "strings"
+
+ // Include filesystem types that OCI spec might mount.
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/dev"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// fdDispenser hands out file descriptors from a list, front to back.
+type fdDispenser struct {
+	// fds holds the descriptors that have not been handed out yet.
+	fds []int
+}
+
+// remove takes the first remaining descriptor off the list and returns it. It
+// panics when the list is empty, so callers should consult empty first.
+func (f *fdDispenser) remove() int {
+	head, rest := f.fds[0], f.fds[1:]
+	f.fds = rest
+	return head
+}
+
+// empty reports whether every descriptor has been handed out.
+func (f *fdDispenser) empty() bool {
+	return len(f.fds) < 1
+}
+
+// createMountNamespace creates a mount manager containing the root filesystem
+// and all mounts.
+//
+// The descriptors in ioFDs are handed out in order: first to the root mount and
+// then to the mounts in spec.Mounts that need one. It is an error for any
+// descriptor to be left over. Besides the mounts from the spec, "/dev" is
+// always mounted, and "/proc", "/sys" and "/tmp" are mounted unless the spec
+// already provides them.
+func createMountNamespace(ctx context.Context, spec *specs.Spec, conf *Config, ioFDs []int) (*fs.MountNamespace, error) {
+	fds := &fdDispenser{fds: ioFDs}
+
+	// Create the MountNamespace from the root.
+	rootInode, err := createRootMount(ctx, spec, conf, fds)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create root overlay: %v", err)
+	}
+	mns, err := fs.NewMountNamespace(ctx, rootInode)
+	if err != nil {
+		return nil, fmt.Errorf("failed to construct MountNamespace: %v", err)
+	}
+
+	// Keep track of whether proc, sys, and tmp were mounted.
+	var procMounted, sysMounted, tmpMounted bool
+
+	// Mount all submounts from the spec.
+	for _, m := range spec.Mounts {
+		// Compare against the cleaned destination so that spellings such as
+		// "/proc/" are recognized too.
+		dst := filepath.Clean(m.Destination)
+
+		// OCI spec uses many different mounts for the things inside of '/dev'. We
+		// have a single mount at '/dev' that is always mounted, regardless of
+		// whether it was asked for, as the spec says we SHOULD.
+		//
+		// Only "/dev" itself and paths below it are skipped. A plain prefix
+		// match would also drop unrelated mounts such as "/devices".
+		if dst == "/dev" || strings.HasPrefix(dst, "/dev/") {
+			log.Warningf("ignoring dev mount at %q", m.Destination)
+			continue
+		}
+		switch dst {
+		case "/proc":
+			procMounted = true
+		case "/sys":
+			sysMounted = true
+		case "/tmp":
+			tmpMounted = true
+		}
+
+		if err := mountSubmount(ctx, spec, conf, mns, fds, m); err != nil {
+			return nil, err
+		}
+	}
+
+	// Always mount /dev.
+	if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{
+		Type:        "devtmpfs",
+		Destination: "/dev",
+	}); err != nil {
+		return nil, err
+	}
+
+	// Mount proc and sys even if the user did not ask for it, as the spec
+	// says we SHOULD.
+	if !procMounted {
+		if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{
+			Type:        "proc",
+			Destination: "/proc",
+		}); err != nil {
+			return nil, err
+		}
+	}
+	if !sysMounted {
+		if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{
+			Type:        "sysfs",
+			Destination: "/sys",
+		}); err != nil {
+			return nil, err
+		}
+	}
+
+	// Technically we don't have to mount tmpfs at /tmp, as we could just
+	// rely on the host /tmp, but this is a nice optimization, and fixes
+	// some apps that call mknod in /tmp.
+	if !tmpMounted {
+		if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{
+			Type:        "tmpfs",
+			Destination: "/tmp",
+		}); err != nil {
+			return nil, err
+		}
+	}
+
+	// Every descriptor passed in must have been claimed by a mount.
+	if !fds.empty() {
+		return nil, fmt.Errorf("not all mount points were consumed, remaining: %v", fds)
+	}
+
+	return mns, nil
+}
+
+// createRootMount creates the root filesystem.
+//
+// The root is mounted over 9P when conf.FileAccess is FileAccessProxy, which
+// takes one descriptor from fds, or through whitelistfs rooted at
+// spec.Root.Path when it is FileAccessDirect. The mount is then overlaid on a
+// ramfs tree holding stub directories for all submount paths and, when
+// conf.Overlay is set, covered with a writable tmpfs overlay. The inode of the
+// resulting root is returned.
+func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.Inode, error) {
+	// First construct the filesystem from the spec.Root.
+	mf := fs.MountSourceFlags{
+		ReadOnly: spec.Root.Readonly,
+		NoAtime:  true,
+	}
+
+	var (
+		rootInode *fs.Inode
+		err       error
+	)
+	switch conf.FileAccess {
+	case FileAccessProxy:
+		// remove panics on an empty dispenser; report a missing descriptor
+		// as an error instead.
+		if fds == nil || fds.empty() {
+			return nil, fmt.Errorf("no I/O file descriptor available for the root mount")
+		}
+		fd := fds.remove()
+		log.Infof("Mounting root over 9P, ioFD: %d", fd)
+		hostFS := mustFindFilesystem("9p")
+		rootInode, err = hostFS.Mount(ctx, "root", mf, fmt.Sprintf("trans=fd,rfdno=%d,wfdno=%d,privateunixsocket=true", fd, fd))
+		if err != nil {
+			return nil, fmt.Errorf("failed to generate root mount point: %v", err)
+		}
+
+	case FileAccessDirect:
+		hostFS := mustFindFilesystem("whitelistfs")
+		rootInode, err = hostFS.Mount(ctx, "root", mf, "root="+spec.Root.Path+",dont_translate_ownership=true")
+		if err != nil {
+			return nil, fmt.Errorf("failed to generate root mount point: %v", err)
+		}
+
+	default:
+		return nil, fmt.Errorf("invalid file access type: %v", conf.FileAccess)
+	}
+
+	// We need to overlay the root on top of a ramfs with stub directories
+	// for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always
+	// mounted even if they are not in the spec.
+	submounts := append(subtargets("/", spec.Mounts), "/dev", "/sys", "/proc", "/tmp")
+	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
+	if err != nil {
+		return nil, fmt.Errorf("error adding submount overlay: %v", err)
+	}
+
+	if conf.Overlay {
+		log.Debugf("Adding overlay on top of root mount")
+		// Overlay a tmpfs filesystem on top of the root.
+		rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	log.Infof("Mounted %q to \"/\" type root", spec.Root.Path)
+	return rootInode, nil
+}
+
+// addOverlay mounts a fresh tmpfs named name+"-upper" and returns an overlay
+// root that places it on top of lower. The tmpfs and the overlay both use
+// lowerFlags with ReadOnly cleared, because the upper layer has to be
+// writable. conf is not consulted.
+func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
+	// lowerFlags was passed by value, so the caller's copy is untouched.
+	upperFlags := lowerFlags
+	upperFlags.ReadOnly = false
+
+	upper, err := mustFindFilesystem("tmpfs").Mount(ctx, name+"-upper", upperFlags, "")
+	if err != nil {
+		return nil, fmt.Errorf("failed to create tmpfs overlay: %v", err)
+	}
+	return fs.NewOverlayRoot(ctx, upper, lower, upperFlags)
+}
+
+// mountSubmount mounts m at m.Destination inside mns.
+//
+// The mount type selects the filesystem: "proc", "sysfs", "devtmpfs" and
+// "tmpfs" use the filesystem of the same name, "none" uses "sysfs", and "bind"
+// uses "9p" or "whitelistfs" depending on conf.FileAccess. Mounts of any other
+// type are skipped with a warning and no error. In FileAccessProxy mode each
+// bind mount takes the next descriptor from fds, and an error is returned when
+// none is available.
+func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount) error {
+	// Map mount type to filesystem name, and parse out the options that we are
+	// capable of dealing with.
+	var data []string
+	var fsName string
+	var useOverlay bool
+	switch m.Type {
+	case "proc", "sysfs", "devtmpfs":
+		fsName = m.Type
+	case "none":
+		fsName = "sysfs"
+	case "tmpfs":
+		fsName = m.Type
+
+		// tmpfs has some extra supported options that we must pass through.
+		var err error
+		data, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid")
+		if err != nil {
+			return err
+		}
+	case "bind":
+		switch conf.FileAccess {
+		case FileAccessProxy:
+			// The dispenser may be nil or already exhausted. Report that as
+			// an error instead of crashing inside remove.
+			if fds == nil || fds.empty() {
+				return fmt.Errorf("no I/O file descriptor available for bind mount at %q", m.Destination)
+			}
+			fd := fds.remove()
+			fsName = "9p"
+			data = []string{"trans=fd", fmt.Sprintf("rfdno=%d", fd), fmt.Sprintf("wfdno=%d", fd), "privateunixsocket=true"}
+		case FileAccessDirect:
+			fsName = "whitelistfs"
+			data = []string{"root=" + m.Source, "dont_translate_ownership=true"}
+		default:
+			return fmt.Errorf("invalid file access type: %v", conf.FileAccess)
+		}
+
+		fi, err := os.Stat(m.Source)
+		if err != nil {
+			return err
+		}
+		// Add overlay to all writable mounts, except when mapping an individual file.
+		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly && fi.Mode().IsDir()
+	default:
+		// TODO: Support all the mount types and make this a
+		// fatal error. Most applications will "just work" without
+		// them, so this is a warning for now.
+		log.Warningf("ignoring unknown filesystem type %q", m.Type)
+		return nil
+	}
+
+	// All filesystem names should have been mapped to something we know.
+	filesystem := mustFindFilesystem(fsName)
+
+	mf := mountFlags(m.Options)
+	if useOverlay {
+		// All writes go to upper, be paranoid and make lower readonly.
+		mf.ReadOnly = true
+	}
+	mf.NoAtime = true
+
+	inode, err := filesystem.Mount(ctx, m.Type, mf, strings.Join(data, ","))
+	if err != nil {
+		return fmt.Errorf("failed to create mount with source %q: %v", m.Source, err)
+	}
+
+	// If there are submounts, we need to overlay the mount on top of a
+	// ramfs with stub directories for submount paths.
+	//
+	// We do not do this for /dev, since there will usually be submounts in
+	// the spec, but our devfs implementation contains all the necessary
+	// directories and files (well, most of them anyways).
+	if m.Destination != "/dev" {
+		submounts := subtargets(m.Destination, spec.Mounts)
+		if len(submounts) > 0 {
+			log.Infof("Adding submount overlay over %q", m.Destination)
+			inode, err = addSubmountOverlay(ctx, inode, submounts)
+			if err != nil {
+				return fmt.Errorf("error adding submount overlay: %v", err)
+			}
+		}
+	}
+
+	if useOverlay {
+		log.Debugf("Adding overlay on top of mount %q", m.Destination)
+		if inode, err = addOverlay(ctx, conf, inode, m.Type, mf); err != nil {
+			return err
+		}
+	}
+
+	// Look up the destination directory and attach the new mount to it. The
+	// references taken on the root and on the destination are dropped on
+	// return.
+	root := mns.Root()
+	defer root.DecRef()
+	dirent, err := mns.FindInode(ctx, root, nil, m.Destination, linux.MaxSymlinkTraversals)
+	if err != nil {
+		return fmt.Errorf("failed to find mount destination %q: %v", m.Destination, err)
+	}
+	defer dirent.DecRef()
+	if err := mns.Mount(ctx, dirent, inode); err != nil {
+		return fmt.Errorf("failed to mount at destination %q: %v", m.Destination, err)
+	}
+
+	log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type)
+	return nil
+}
+
+// mkdirAll walks path inside mns one component at a time, starting at the
+// root of the namespace, and creates every directory that does not exist yet
+// with mode 0755. The path is always treated as absolute. The first lookup or
+// creation failure is returned.
+func mkdirAll(ctx context.Context, mns *fs.MountNamespace, path string) error {
+	root := mns.Root()
+	defer root.DecRef()
+
+	// parent is the directory reached so far. Every dirent stored here other
+	// than root came from FindInode or Walk and carries a reference that has
+	// to be dropped once we move past it, otherwise it leaks.
+	// NOTE(review): assumes Walk returns a referenced dirent just like
+	// FindInode does - confirm against the fs package.
+	parent := root
+	defer func() {
+		if parent != root {
+			parent.DecRef()
+		}
+	}()
+
+	// Starting at the root, walk the path.
+	for _, name := range strings.Split(filepath.Clean(path), string(filepath.Separator)) {
+		if name == "" {
+			// This will be case for the first and last element, if the path
+			// begins or ends with '/'. Note that we always treat the path as
+			// absolute, regardless of what the first character contains.
+			continue
+		}
+		d, err := mns.FindInode(ctx, root, parent, name, fs.DefaultTraversalLimit)
+		if err == syserror.ENOENT {
+			// If we encounter a path that does not exist, then
+			// create it.
+			if err := parent.CreateDirectory(ctx, root, name, fs.FilePermsFromMode(0755)); err != nil {
+				return fmt.Errorf("failed to create directory %q: %v", name, err)
+			}
+			if d, err = parent.Walk(ctx, root, name); err != nil {
+				return fmt.Errorf("walk to %q failed: %v", name, err)
+			}
+		} else if err != nil {
+			return fmt.Errorf("failed to find inode %q: %v", name, err)
+		}
+
+		// Done with the previous directory; release it before descending.
+		if parent != root {
+			parent.DecRef()
+		}
+		parent = d
+	}
+	return nil
+}
+
+// parseAndFilterOptions parses a MountOptions slice and filters by the allowed
+// keys.
+//
+// Each option is either a bare key or has the form key=value. Options whose
+// key is listed in allowedKeys are returned unchanged and in their original
+// order; all others are dropped with a warning. An option containing more
+// than one '=' is invalid and makes the function return an error.
+func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) {
+	var out []string
+	for _, o := range opts {
+		kv := strings.Split(o, "=")
+		if len(kv) > 2 {
+			return nil, fmt.Errorf("invalid option %q", o)
+		}
+		// kv[0] is the key in both forms; for a bare key it equals o.
+		key := kv[0]
+		if !contains(allowedKeys, key) {
+			// Log the key itself rather than the whole split slice, so the
+			// message reads the same for both option forms.
+			log.Warningf("ignoring unsupported key %q", key)
+			continue
+		}
+		out = append(out, o)
+	}
+	return out, nil
+}
+
+// destinations returns the Destination of every mount, in order, followed by
+// the extra paths. The result is nil when both inputs are empty.
+func destinations(mounts []specs.Mount, extra ...string) []string {
+	var out []string
+	for i := range mounts {
+		out = append(out, mounts[i].Destination)
+	}
+	out = append(out, extra...)
+	return out
+}
+
+func mountFlags(opts []string) fs.MountSourceFlags {
+ mf := fs.MountSourceFlags{}
+ for _, o := range opts {
+ switch o {
+ case "ro":
+ mf.ReadOnly = true
+ case "noatime":
+ mf.NoAtime = true
+ default:
+ log.Warningf("ignorning unknown mount option %q", o)
+ }
+ }
+ return mf
+}
+
+// contains reports whether str is one of the elements of strs.
+func contains(strs []string, str string) bool {
+	for i := 0; i < len(strs); i++ {
+		if strs[i] == str {
+			return true
+		}
+	}
+	return false
+}
+
+// mustFindFilesystem returns the filesystem registered under name and panics
+// when no such filesystem exists.
+func mustFindFilesystem(name string) fs.Filesystem {
+	if filesystem, found := fs.FindFilesystem(name); found {
+		return filesystem
+	}
+	panic(fmt.Sprintf("could not find filesystem %q", name))
+}
+
+// addSubmountOverlay builds a ramfs directory tree containing the paths in
+// submounts and returns an overlay root with inode as the upper layer and
+// that tree as the lower layer.
+func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
+	// There is no real filesystem backing this ramfs tree, so the mount
+	// source is created with "nil".
+	msrc := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{})
+	stubTree, err := ramfs.MakeDirectoryTree(ctx, msrc, submounts)
+	if err != nil {
+		return nil, fmt.Errorf("error creating mount tree: %v", err)
+	}
+
+	overlay, err := fs.NewOverlayRoot(ctx, inode, stubTree, fs.MountSourceFlags{})
+	if err != nil {
+		return nil, fmt.Errorf("failed to make mount overlay: %v", err)
+	}
+	return overlay, nil
+}
+
+// subtargets takes a set of Mounts and returns only the targets that are
+// children of the given root. The returned paths are relative to the root:
+// the cleaned destination with the cleaned root stripped from its front. For
+// root "/" that leaves no leading separator ("/dev" becomes "dev"); for any
+// other root the separator is kept ("/foo/bar" under "/foo" becomes "/bar").
+// A mount at the root itself is not a submount and is left out.
+func subtargets(root string, mnts []specs.Mount) []string {
+	r := filepath.Clean(root)
+	var targets []string
+	for _, mnt := range mnts {
+		t := filepath.Clean(mnt.Destination)
+		if !strings.HasPrefix(t, r) {
+			continue
+		}
+		rel := t[len(r):]
+		if rel == "" {
+			// mnt IS the root mount, not a submount.
+			continue
+		}
+		// A real child continues with a path separator right after the
+		// root. Without this check "/foobar" would be reported as child
+		// "bar" of "/foo". The root "/" already ends in a separator, so
+		// everything below it qualifies.
+		if !strings.HasSuffix(r, "/") && !strings.HasPrefix(rel, "/") {
+			continue
+		}
+		targets = append(targets, rel)
+	}
+	return targets
+}
diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go
new file mode 100644
index 000000000..ea72de8e9
--- /dev/null
+++ b/runsc/boot/limits.go
@@ -0,0 +1,60 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+)
+
+// fromLinuxResource maps the RLIMIT_* resource names used in spec.Process.Rlimits to the matching limits.LimitType; createLimitSet rejects names that are missing here.
+var fromLinuxResource = map[string]limits.LimitType{
+ "RLIMIT_CPU": limits.CPU,
+ "RLIMIT_FSIZE": limits.FileSize,
+ "RLIMIT_DATA": limits.Data,
+ "RLIMIT_STACK": limits.Stack,
+ "RLIMIT_CORE": limits.Core,
+ "RLIMIT_RSS": limits.Rss,
+ "RLIMIT_NPROC": limits.ProcessCount,
+ "RLIMIT_NOFILE": limits.NumberOfFiles,
+ "RLIMIT_MEMLOCK": limits.MemoryPagesLocked,
+ "RLIMIT_AS": limits.AS,
+ "RLIMIT_LOCKS": limits.Locks,
+ "RLIMIT_SIGPENDING": limits.SignalsPending,
+ "RLIMIT_MSGQUEUE": limits.MessageQueueBytes,
+ "RLIMIT_NICE": limits.Nice,
+ "RLIMIT_RTPRIO": limits.RealTimePriority,
+ "RLIMIT_RTTIME": limits.Rttime,
+}
+
+// createLimitSet returns the limit set produced by
+// limits.NewLinuxDistroLimitSet with every rlimit from spec.Process.Rlimits
+// applied on top: Soft becomes the current value and Hard the maximum. A
+// resource name that is not present in fromLinuxResource is an error.
+func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) {
+	limitSet, err := limits.NewLinuxDistroLimitSet()
+	if err != nil {
+		return nil, err
+	}
+	for _, rlimit := range spec.Process.Rlimits {
+		limitType, known := fromLinuxResource[rlimit.Type]
+		if !known {
+			return nil, fmt.Errorf("unknown resource %q", rlimit.Type)
+		}
+		limit := limits.Limit{Cur: rlimit.Soft, Max: rlimit.Hard}
+		limitSet.SetUnchecked(limitType, limit)
+	}
+	return limitSet, nil
+}
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
new file mode 100644
index 000000000..a470cb054
--- /dev/null
+++ b/runsc/boot/loader.go
@@ -0,0 +1,354 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package boot loads the kernel and runs the application.
+package boot
+
+import (
+ "fmt"
+ "math/rand"
+ "sync/atomic"
+ "syscall"
+ gtime "time"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/inet"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/loader"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/sighandling"
+ slinux "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/udp"
+ "gvisor.googlesource.com/gvisor/runsc/boot/filter"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+
+ // Include supported socket providers.
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/hostinet"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/route"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix"
+)
+
+// Loader keeps the state needed to start the kernel and run the application; it is built by New and released by Destroy.
+type Loader struct {
+ // k is the kernel, created and initialized in New.
+ k *kernel.Kernel
+
+ // ctrl is the control server; Destroy stops it when it is non-nil.
+ ctrl *controller
+
+ conf *Config // conf is the configuration that was passed to New.
+
+ // console is set to true if terminal is enabled; it is forwarded to filter.Install by run.
+ console bool
+
+ watchdog *watchdog.Watchdog // watchdog is created in New, started by run and stopped by Destroy.
+
+ // stopSignalForwarding disables forwarding of signals to the sandboxed
+ // app. It should be called when a sandbox is destroyed; Destroy calls it.
+ stopSignalForwarding func()
+
+ // procArgs holds the arguments that run passes to CreateProcess for the initial application task.
+ procArgs kernel.CreateProcessArgs
+}
+
+// init seeds the math/rand generator from the current time and registers the
+// AMD64 Linux syscall table with the kernel package.
+func init() {
+	// Initialize the random number generator.
+	seed := gtime.Now().UnixNano()
+	rand.Seed(seed)
+
+	// Register the global syscall table.
+	kernel.RegisterSyscallTable(slinux.AMD64)
+}
+
+// New initializes a new kernel loader configured by spec.
+//
+// It creates the platform and kernel selected by conf, derives limits,
+// capabilities and credentials from spec.Process, starts the control server
+// on controllerFD, builds the mount namespace (using ioFDs for the mounts that
+// need a descriptor) and prepares the arguments of the initial process. The
+// process itself is only created later, by Run. console states whether a
+// terminal is enabled; it is passed to createFDMap and stored in the Loader.
+//
+// An error is returned when spec.Process.Args is empty or when any of the
+// setup steps fails.
+func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console bool) (*Loader, error) {
+	// The first argument names the executable. Reject a spec without one up
+	// front instead of panicking on the index below.
+	if len(spec.Process.Args) == 0 {
+		return nil, fmt.Errorf("error getting executable path: spec.Process.Args is empty")
+	}
+
+	// Create kernel and platform.
+	p, err := createPlatform(conf)
+	if err != nil {
+		return nil, fmt.Errorf("error creating platform: %v", err)
+	}
+	k := &kernel.Kernel{
+		Platform: p,
+	}
+
+	// Create VDSO.
+	vdso, err := loader.PrepareVDSO(p)
+	if err != nil {
+		return nil, fmt.Errorf("error creating vdso: %v", err)
+	}
+
+	// Create timekeeper.
+	tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
+	if err != nil {
+		return nil, fmt.Errorf("error creating timekeeper: %v", err)
+	}
+	tk.SetClocks(time.NewCalibratedClocks())
+
+	// Create initial limits.
+	ls, err := createLimitSet(spec)
+	if err != nil {
+		return nil, fmt.Errorf("error creating limits: %v", err)
+	}
+
+	// Create capabilities.
+	caps, err := specutils.Capabilities(spec.Process.Capabilities)
+	if err != nil {
+		return nil, fmt.Errorf("error creating capabilities: %v", err)
+	}
+
+	// Convert the spec's additional GIDs to KGIDs.
+	extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids))
+	for _, GID := range spec.Process.User.AdditionalGids {
+		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
+	}
+
+	// Create credentials. NewUserCredentials does not return an error, so
+	// there is nothing to check here.
+	creds := auth.NewUserCredentials(
+		auth.KUID(spec.Process.User.UID),
+		auth.KGID(spec.Process.User.GID),
+		extraKGIDs,
+		caps,
+		auth.NewRootUserNamespace())
+
+	// Create the UTS namespace.
+	// TODO: Not clear what domain name should be here. It is
+	// not configurable from runtime spec.
+	utsns := kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace)
+
+	ipcns := kernel.NewIPCNamespace()
+
+	if err := enableStrace(conf); err != nil {
+		return nil, fmt.Errorf("failed to enable strace: %v", err)
+	}
+
+	// Get the executable path, which is a bit tricky because we have to
+	// inspect the environment PATH which is relative to the root path.
+	exec, err := specutils.GetExecutablePath(spec.Process.Args[0], spec.Root.Path, spec.Process.Env)
+	if err != nil {
+		return nil, fmt.Errorf("error getting executable path: %v", err)
+	}
+
+	// Create the process arguments.
+	procArgs := kernel.CreateProcessArgs{
+		Filename:         exec,
+		Argv:             spec.Process.Args,
+		Envv:             spec.Process.Env,
+		WorkingDirectory: spec.Process.Cwd,
+		Credentials:      creds,
+		// Creating the FDMap requires that we have kernel.Kernel.fdMapUids, so
+		// it must wait until we have a Kernel.
+		Umask:                uint(syscall.Umask(0)),
+		Limits:               ls,
+		MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
+		UTSNamespace:         utsns,
+		IPCNamespace:         ipcns,
+	}
+
+	// Create an empty network stack because the network namespace may be empty at
+	// this point. Netns is configured before Run() is called. Netstack is
+	// configured using a control uRPC message. Host network is configured inside
+	// Run().
+	networkStack := newEmptyNetworkStack(conf)
+
+	// Initiate the Kernel object, which is required by the Context passed
+	// to createVFS in order to mount (among other things) procfs.
+	if err = k.Init(kernel.InitKernelArgs{
+		FeatureSet:        cpuid.HostFeatureSet(),
+		Timekeeper:        tk,
+		RootUserNamespace: creds.UserNamespace,
+		NetworkStack:      networkStack,
+		ApplicationCores:  8,
+		Vdso:              vdso,
+		RootUTSNamespace:  utsns,
+		RootIPCNamespace:  ipcns,
+	}); err != nil {
+		return nil, fmt.Errorf("error initializing kernel: %v", err)
+	}
+
+	// Turn on packet logging if enabled.
+	if conf.LogPackets {
+		log.Infof("Packet logging enabled")
+		atomic.StoreUint32(&sniffer.LogPackets, 1)
+	} else {
+		log.Infof("Packet logging disabled")
+		atomic.StoreUint32(&sniffer.LogPackets, 0)
+	}
+
+	// Create the control server using the provided FD.
+	//
+	// This must be done *after* we have initialized the kernel since the
+	// controller is used to configure the kernel's network stack.
+	//
+	// This should also be *before* we create the process, since a
+	// misconfigured process will cause an error, and we want the control
+	// server up before that so that we don't time out trying to connect to
+	// it.
+	ctrl, err := newController(controllerFD, k)
+	if err != nil {
+		return nil, fmt.Errorf("error creating control server: %v", err)
+	}
+
+	ctx := procArgs.NewContext(k)
+
+	// Create the virtual filesystem.
+	mm, err := createMountNamespace(ctx, spec, conf, ioFDs)
+	if err != nil {
+		return nil, fmt.Errorf("error creating mounts: %v", err)
+	}
+	k.SetRootMountNamespace(mm)
+
+	// Create the FD map, which will set stdin, stdout, and stderr. If console
+	// is true, then ioctl calls will be passed through to the host fd.
+	fdm, err := createFDMap(ctx, k, ls, console)
+	if err != nil {
+		return nil, fmt.Errorf("error importing fds: %v", err)
+	}
+
+	// CreateProcess takes a reference on FDMap if successful. We
+	// won't need ours either way.
+	procArgs.FDMap = fdm
+
+	// We don't care about child signals; some platforms can generate a
+	// tremendous number of useless ones (I'm looking at you, ptrace).
+	if err := sighandling.IgnoreChildStop(); err != nil {
+		return nil, fmt.Errorf("failed to ignore child stop signals: %v", err)
+	}
+	// Ensure that most signals received in sentry context are forwarded to
+	// the emulated kernel.
+	stopSignalForwarding := sighandling.StartForwarding(k)
+
+	watchdog := watchdog.New(k, watchdog.DefaultTimeout, watchdog.LogWarning)
+	return &Loader{
+		k:                    k,
+		ctrl:                 ctrl,
+		conf:                 conf,
+		console:              console,
+		watchdog:             watchdog,
+		stopSignalForwarding: stopSignalForwarding,
+		procArgs:             procArgs,
+	}, nil
+}
+
+// Destroy releases what the loader holds: it stops the control server when
+// there is one, then stops signal forwarding and the watchdog.
+func (l *Loader) Destroy() {
+	// Shut down control server.
+	if ctrl := l.ctrl; ctrl != nil {
+		ctrl.srv.Stop()
+	}
+	l.stopSignalForwarding()
+	l.watchdog.Stop()
+}
+
+// createPlatform instantiates the platform named by conf.Platform, logging
+// which one was chosen. Values other than PlatformPtrace and PlatformKVM
+// produce an error.
+func createPlatform(conf *Config) (platform.Platform, error) {
+	if conf.Platform == PlatformPtrace {
+		log.Infof("Platform: ptrace")
+		return ptrace.New()
+	}
+	if conf.Platform == PlatformKVM {
+		log.Infof("Platform: kvm")
+		return kvm.New()
+	}
+	return nil, fmt.Errorf("invalid platform %v", conf.Platform)
+}
+
+// Run starts the application through run, sends the outcome (nil on success)
+// on the control server's startResultChan, and returns that same outcome.
+func (l *Loader) Run() error {
+	runErr := l.run()
+	l.ctrl.app.startResultChan <- runErr
+	return runErr
+}
+
+// run performs the last configuration steps and starts the kernel: it
+// configures the host network stack when host networking is selected,
+// installs the seccomp filters unless they are disabled, creates the initial
+// application task, starts the watchdog and finally starts the kernel.
+func (l *Loader) run() error {
+	hostNet := l.conf.Network == NetworkHost
+	if hostNet {
+		// Delay host network configuration to this point because network namespace
+		// is configured after the loader is created and before Run() is called.
+		log.Debugf("Configuring host network")
+		if err := l.k.NetworkStack().(*hostinet.Stack).Configure(); err != nil {
+			return err
+		}
+	}
+
+	// Finally done with all configuration. Setup filters before user code
+	// is loaded.
+	if !l.conf.DisableSeccomp {
+		whitelistFS := l.conf.FileAccess == FileAccessDirect
+		if err := filter.Install(l.k.Platform, whitelistFS, l.console, hostNet); err != nil {
+			return fmt.Errorf("Failed to install seccomp filters: %v", err)
+		}
+	} else {
+		filter.Report("syscall filter is DISABLED. Running in less secure mode.")
+	}
+
+	// Create the initial application task.
+	_, err := l.k.CreateProcess(l.procArgs)
+	if err != nil {
+		return fmt.Errorf("failed to create init process: %v", err)
+	}
+
+	// CreateProcess takes a reference on FDMap if successful, so ours can
+	// be dropped now.
+	l.procArgs.FDMap.DecRef()
+
+	l.watchdog.Start()
+	return l.k.Start()
+}
+
+// WaitForStartSignal blocks until a value arrives on the control server's
+// startChan.
+func (l *Loader) WaitForStartSignal() {
+	startChan := l.ctrl.app.startChan
+	<-startChan
+}
+
+// WaitExit blocks until the kernel reports that the application has exited
+// and then returns the exit status of the global init task.
+func (l *Loader) WaitExit() kernel.ExitStatus {
+	// Wait for application.
+	l.k.WaitExited()
+
+	initTask := l.k.GlobalInit()
+	return initTask.ExitStatus()
+}
+
+// newEmptyNetworkStack returns a network stack matching conf.Network that has
+// not been configured yet: a hostinet stack for NetworkHost, and a netstack
+// supporting IPv4, IPv6, ARP, TCP and UDP for NetworkNone and NetworkSandbox.
+// Any other value makes it panic.
+func newEmptyNetworkStack(conf *Config) inet.Stack {
+	if conf.Network == NetworkHost {
+		return hostinet.NewStack()
+	}
+	if conf.Network == NetworkNone || conf.Network == NetworkSandbox {
+		// NetworkNone sets up loopback using netstack.
+		networkProtocols := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName}
+		transportProtocols := []string{tcp.ProtocolName, udp.ProtocolName}
+		return &epsocket.Stack{stack.New(networkProtocols, transportProtocols)}
+	}
+	panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
+}
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
new file mode 100644
index 000000000..2fc16b241
--- /dev/null
+++ b/runsc/boot/loader_test.go
@@ -0,0 +1,238 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "os"
+ "testing"
+ "time"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/control/server"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
+)
+
+// init raises the log level to Debug for every test in this package.
+func init() {
+	level := log.Debug
+	log.SetLevel(level)
+}
+
+// testSpec builds a minimal spec for tests: a read-only root at "/" and a
+// process whose only argument is /bin/true.
+func testSpec() *specs.Spec {
+	// The host filesystem root is the sandbox root.
+	root := &specs.Root{
+		Path:     "/",
+		Readonly: true,
+	}
+	process := &specs.Process{
+		Args: []string{"/bin/true"},
+	}
+	return &specs.Spec{
+		Root:    root,
+		Process: process,
+	}
+}
+
+func createLoader() (*Loader, error) {
+ fd, err := server.CreateSocket(ControlSocketAddr("123"))
+ if err != nil {
+ return nil, err
+ }
+ conf := &Config{
+ RootDir: "unused_root_dir",
+ Network: NetworkNone,
+ FileAccess: FileAccessDirect,
+ DisableSeccomp: true,
+ }
+ return New(testSpec(), conf, fd, nil, false)
+}
+
+// TestRun boots the test loader, runs its application and verifies that the
+// application exits cleanly.
+func TestRun(t *testing.T) {
+	l, err := createLoader()
+	if err != nil {
+		t.Fatalf("error creating loader: %v", err)
+	}
+	defer l.Destroy()
+
+	// Run the application.
+	if runErr := l.Run(); runErr != nil {
+		t.Errorf("error running application: %v", runErr)
+	}
+
+	// Wait for the application to exit. Success means that both the exit
+	// code and the signal number are zero.
+	status := l.WaitExit()
+	if !(status.Code == 0 && status.Signo == 0) {
+		t.Errorf("application exited with status %+v, want 0", status)
+	}
+}
+
+// TestStartSignal tests that the controller Start message will cause
+// WaitForStartSignal to return.
+func TestStartSignal(t *testing.T) {
+	s, err := createLoader()
+	if err != nil {
+		t.Fatalf("error creating loader: %v", err)
+	}
+	defer s.Destroy()
+
+	// We aren't going to wait on this application, so the control server
+	// needs to be shut down manually.
+	defer s.ctrl.srv.Stop()
+
+	// Start a goroutine that calls WaitForStartSignal and writes to a
+	// channel when it returns. The channel is buffered so that the
+	// goroutine can always deliver its notification and exit, even if the
+	// test has already stopped waiting for it.
+	waitFinished := make(chan struct{}, 1)
+	go func() {
+		s.WaitForStartSignal()
+		// Pretend that Run() executed and returned no error.
+		s.ctrl.app.startResultChan <- nil
+		waitFinished <- struct{}{}
+	}()
+
+	// Nothing has been written to the channel, so waitFinished should not
+	// return. Give it a little bit of time to make sure the goroutine has
+	// started.
+	select {
+	case <-waitFinished:
+		t.Errorf("WaitForStartSignal completed but it should not have")
+	case <-time.After(50 * time.Millisecond):
+		// OK.
+	}
+
+	// Trigger the control server Start method.
+	if err := s.ctrl.app.Start(nil, nil); err != nil {
+		t.Errorf("error calling Start: %v", err)
+	}
+
+	// Now WaitForStartSignal should return (within a short amount of
+	// time).
+	select {
+	case <-waitFinished:
+		// OK.
+	case <-time.After(50 * time.Millisecond):
+		t.Errorf("WaitForStartSignal did not complete but it should have")
+	}
+
+}
+
+// TestCreateMountNamespace checks that a MountNamespace can be created with
+// various specs and that the expected paths can be found in it.
+//
+// Each test case runs as its own subtest so that the references taken on the
+// namespace and its root are dropped as soon as that case finishes, rather
+// than piling up until the whole test function returns.
+func TestCreateMountNamespace(t *testing.T) {
+	conf := &Config{
+		RootDir:        "unused_root_dir",
+		FileAccess:     FileAccessDirect,
+		DisableSeccomp: true,
+	}
+
+	testCases := []struct {
+		name string
+		// Spec that will be used to create the mount manager. Note
+		// that we can't mount procfs without a kernel, so each spec
+		// MUST contain something other than procfs mounted at /proc.
+		spec specs.Spec
+		// Paths that are expected to exist in the resulting fs.
+		expectedPaths []string
+	}{
+		{
+			// The only mount is a tmpfs standing in for /proc.
+			name: "only proc mount",
+			spec: specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
+				},
+				Mounts: []specs.Mount{
+					{
+						Destination: "/proc",
+						Type:        "tmpfs",
+					},
+				},
+			},
+			// /proc, /dev, and /sys should always be mounted.
+			expectedPaths: []string{"/proc", "/dev", "/sys"},
+		},
+		{
+			// Mount at a deep path, with many components that do
+			// not exist in the root.
+			name: "deep mount path",
+			spec: specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
+				},
+				Mounts: []specs.Mount{
+					{
+						Destination: "/some/very/very/deep/path",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/proc",
+						Type:        "tmpfs",
+					},
+				},
+			},
+			// /some/very/very/deep/path should be mounted, along
+			// with /proc, /dev, and /sys.
+			expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"},
+		},
+		{
+			// Mounts are nested inside each other.
+			name: "nested mounts",
+			spec: specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
+				},
+				Mounts: []specs.Mount{
+					{
+						Destination: "/proc",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/foo",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/foo/bar",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/foo/bar/baz",
+						Type:        "tmpfs",
+					},
+					{
+						// A deep path that is in foo but not the other mounts.
+						Destination: "/foo/some/very/very/deep/path",
+						Type:        "tmpfs",
+					},
+				},
+			},
+			expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"},
+		},
+	}
+
+	for i := range testCases {
+		// Take the address of the slice element so the spec is not copied.
+		tc := &testCases[i]
+		t.Run(tc.name, func(t *testing.T) {
+			ctx := contexttest.Context(t)
+			mm, err := createMountNamespace(ctx, &tc.spec, conf, nil)
+			if err != nil {
+				t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err)
+			}
+			defer mm.DecRef()
+			root := mm.Root()
+			defer root.DecRef()
+			for _, p := range tc.expectedPaths {
+				if _, err := mm.FindInode(ctx, root, root, p, 0); err != nil {
+					t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
+				}
+			}
+		})
+	}
+}
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
new file mode 100644
index 000000000..d2b52c823
--- /dev/null
+++ b/runsc/boot/network.go
@@ -0,0 +1,213 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+ "math/rand"
+ "net"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/link/loopback"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
+ "gvisor.googlesource.com/gvisor/pkg/urpc"
+)
+
+// Network exposes methods that can be used to configure a network stack.
+type Network struct {
+ // Stack is the network stack that CreateLinksAndRoutes adds NICs,
+ // addresses and routes to.
+ Stack *stack.Stack
+}
+
+// Route represents a route in the network stack.
+//
+// Destination, Mask and Gateway are each converted with ipToAddress when the
+// route is turned into a tcpip.Route. A Route whose three fields are all nil
+// is considered unset (see Empty).
+type Route struct {
+ Destination net.IP
+ Mask net.IPMask
+ Gateway net.IP
+}
+
+// DefaultRoute represents a catch all route to the default gateway.
+type DefaultRoute struct {
+ // Route is the route to add. It is ignored when Route.Empty() is true.
+ Route Route
+ // Name is the name of the link the route is bound to. It must match the
+ // Name of one of the links handed to CreateLinksAndRoutes, otherwise
+ // that call fails.
+ Name string
+}
+
+// FDBasedLink configures an fd-based link.
+type FDBasedLink struct {
+ // Name is the name given to the NIC created for this link.
+ Name string
+ // MTU is converted to uint32 and passed to the fd-based endpoint.
+ MTU int
+ // Addresses are the addresses added to the NIC.
+ Addresses []net.IP
+ // Routes are collected into the route table, bound to this link's NIC.
+ Routes []Route
+}
+
+// LoopbackLink configures a loopback link.
+type LoopbackLink struct {
+ // Name is the name given to the NIC created for this link.
+ Name string
+ // Addresses are the addresses added to the NIC.
+ Addresses []net.IP
+ // Routes are collected into the route table, bound to this link's NIC.
+ Routes []Route
+}
+
+// CreateLinksAndRoutesArgs are arguments to CreateLinksAndRoutes.
+type CreateLinksAndRoutesArgs struct {
+ // FilePayload contains the fds associated with the FDBasedLinks. The
+ // two slices must have the same length; the file at index i is used for
+ // the link at index i.
+ urpc.FilePayload
+
+ // LoopbackLinks are created first, before any FDBasedLinks.
+ LoopbackLinks []LoopbackLink
+ FDBasedLinks []FDBasedLink
+
+ // DefaultGateway is added after the per-link routes, unless its Route
+ // is empty.
+ DefaultGateway DefaultRoute
+}
+
+// Empty reports whether the route is unset, i.e. whether Destination, Mask
+// and Gateway are all nil.
+func (r *Route) Empty() bool {
+	if r.Destination != nil || r.Mask != nil || r.Gateway != nil {
+		return false
+	}
+	return true
+}
+
+func (r *Route) toTcpipRoute(id tcpip.NICID) tcpip.Route {
+ return tcpip.Route{
+ Destination: ipToAddress(r.Destination),
+ Gateway: ipToAddress(r.Gateway),
+ Mask: ipToAddress(net.IP(r.Mask)),
+ NIC: id,
+ }
+}
+
+// CreateLinksAndRoutes creates links and routes in a network stack. It should
+// only be called once.
+//
+// NIC ids are handed out starting at 1 in creation order: loopback links
+// first, then fd-based links. args.FilePayload.Files[i] provides the FD for
+// args.FDBasedLinks[i]; the FD is duplicated before it is given to the
+// endpoint. The routes of every link, followed by the default gateway route
+// when one is set, are passed to the stack's SetRouteTable.
+//
+// An error is returned when the number of files and fd-based links differ,
+// when an FD cannot be duplicated, when a NIC cannot be created, or when the
+// default gateway names a link that was not created here.
+func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error {
+	if len(args.FilePayload.Files) != len(args.FDBasedLinks) {
+		return fmt.Errorf("FilePayload must be same length as FDBasedLinks")
+	}
+
+	var nicID tcpip.NICID
+	nicids := make(map[string]tcpip.NICID)
+
+	// Collect routes from all links.
+	var routes []tcpip.Route
+
+	// Loopback interfaces normally appear before other interfaces.
+	for _, link := range args.LoopbackLinks {
+		nicID++
+		nicids[link.Name] = nicID
+
+		linkEP := loopback.New()
+
+		log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses)
+		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
+			return err
+		}
+
+		// Collect the routes from this link.
+		for _, r := range link.Routes {
+			routes = append(routes, r.toTcpipRoute(nicID))
+		}
+	}
+
+	for i, link := range args.FDBasedLinks {
+		nicID++
+		nicids[link.Name] = nicID
+
+		// Copy the underlying FD.
+		oldFD := args.FilePayload.Files[i].Fd()
+		newFD, err := syscall.Dup(int(oldFD))
+		if err != nil {
+			return fmt.Errorf("failed to dup FD %v: %v", oldFD, err)
+		}
+
+		linkEP := fdbased.New(&fdbased.Options{
+			FD:              newFD,
+			MTU:             uint32(link.MTU),
+			ChecksumOffload: false,
+			EthernetHeader:  true,
+			Address:         tcpip.LinkAddress(generateRndMac()),
+		})
+
+		log.Infof("Enabling interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses)
+		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
+			return err
+		}
+
+		// Collect the routes from this link.
+		for _, r := range link.Routes {
+			routes = append(routes, r.toTcpipRoute(nicID))
+		}
+	}
+
+	if !args.DefaultGateway.Route.Empty() {
+		// Use a distinct name so the running NIC counter above is not
+		// shadowed.
+		gatewayNICID, ok := nicids[args.DefaultGateway.Name]
+		if !ok {
+			return fmt.Errorf("invalid interface name %q for default route", args.DefaultGateway.Name)
+		}
+		routes = append(routes, args.DefaultGateway.Route.toTcpipRoute(gatewayNICID))
+	}
+
+	log.Infof("Setting routes %+v", routes)
+	n.Stack.SetRouteTable(routes)
+	return nil
+}
+
+// createNICWithAddrs registers a named NIC with the stack, wrapping linkEP in
+// a sniffer endpoint, and then assigns the ARP address followed by every
+// address in addrs. It stops at, and returns, the first failure.
+func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, addrs []net.IP) error {
+	sniffed := sniffer.New(linkEP)
+	if nicErr := n.Stack.CreateNamedNIC(id, name, sniffed); nicErr != nil {
+		return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, linkEP, nicErr)
+	}
+
+	// Always start with an arp address for the NIC.
+	if arpErr := n.Stack.AddAddress(id, arp.ProtocolNumber, arp.ProtocolAddress); arpErr != nil {
+		return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, arp.ProtocolNumber, arp.ProtocolAddress, arpErr)
+	}
+
+	for i := range addrs {
+		proto, tcpipAddr := ipToAddressAndProto(addrs[i])
+		if addErr := n.Stack.AddAddress(id, proto, tcpipAddr); addErr != nil {
+			return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, proto, tcpipAddr, addErr)
+		}
+	}
+	return nil
+}
+
+// ipToAddressAndProto converts IP to tcpip.Address and a protocol number.
+//
+// The IP version is decided by ip.To4() rather than by 'len(ip)': an address
+// for which To4 returns nil is reported as IPv6.
+func ipToAddressAndProto(ip net.IP) (tcpip.NetworkProtocolNumber, tcpip.Address) {
+	v4 := ip.To4()
+	if v4 == nil {
+		return ipv6.ProtocolNumber, tcpip.Address(ip)
+	}
+	return ipv4.ProtocolNumber, tcpip.Address(v4)
+}
+
+// ipToAddress converts IP to tcpip.Address and discards the protocol number
+// reported by ipToAddressAndProto.
+func ipToAddress(ip net.IP) tcpip.Address {
+	_, converted := ipToAddressAndProto(ip)
+	return converted
+}
+
+// generateRndMac returns a random local MAC address.
+// Copied from eth_random_addr() (include/linux/etherdevice.h)
+//
+// NOTE(review): the bytes come from math/rand; no seeding is visible in this
+// file, so confirm the generator is seeded elsewhere if distinct addresses
+// across sandboxes are required.
+func generateRndMac() net.HardwareAddr {
+	addr := net.HardwareAddr(make([]byte, 6))
+	rand.Read(addr)
+	// In the first octet, clear the multicast bit (0x1) and set the local
+	// assignment bit (0x2, IEEE802).
+	addr[0] = (addr[0] &^ 0x1) | 0x2
+	return addr
+}
diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go
new file mode 100644
index 000000000..1e898672b
--- /dev/null
+++ b/runsc/boot/strace.go
@@ -0,0 +1,40 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/strace"
+)
+
+// enableStrace initializes strace and, when conf.Strace is set, turns on
+// syscall logging to the log sink. The maximum log size comes from
+// conf.StraceLogSize, with 1024 used when that is zero. An empty
+// conf.StraceSyscalls enables logging for all syscalls; otherwise only the
+// listed ones are enabled.
+func enableStrace(conf *Config) error {
+	// We must initialize even if strace is not enabled.
+	strace.Initialize()
+	if !conf.Strace {
+		return nil
+	}
+
+	limit := conf.StraceLogSize
+	if limit == 0 {
+		limit = 1024
+	}
+	strace.LogMaximumSize = limit
+
+	if len(conf.StraceSyscalls) > 0 {
+		return strace.Enable(conf.StraceSyscalls, strace.SinkTypeLog)
+	}
+	strace.EnableAll(strace.SinkTypeLog)
+	return nil
+}
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
new file mode 100644
index 000000000..128c8f7e6
--- /dev/null
+++ b/runsc/cmd/BUILD
@@ -0,0 +1,58 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+# Library with the implementations of the runsc commands. It is visible only
+# to packages under //runsc.
+go_library(
+ name = "cmd",
+ srcs = [
+ "boot.go",
+ "cmd.go",
+ "create.go",
+ "delete.go",
+ "events.go",
+ "exec.go",
+ "gofer.go",
+ "kill.go",
+ "list.go",
+ "path.go",
+ "ps.go",
+ "run.go",
+ "start.go",
+ "state.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/runsc/cmd",
+ visibility = [
+ "//runsc:__subpackages__",
+ ],
+ deps = [
+ "//pkg/log",
+ "//pkg/p9",
+ "//pkg/sentry/control",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/unet",
+ "//pkg/urpc",
+ "//runsc/boot",
+ "//runsc/fsgofer",
+ "//runsc/sandbox",
+ "//runsc/specutils",
+ "@com_github_google_subcommands//:go_default_library",
+ "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
+
+# Tests for the cmd library. The test sources are embedded into the library
+# target; exec_test.go is the only test file listed.
+go_test(
+ name = "cmd_test",
+ size = "small",
+ srcs = ["exec_test.go"],
+ embed = [":cmd"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/sentry/control",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/urpc",
+ "@com_github_google_go-cmp//cmp:go_default_library",
+ "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
+ "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ ],
+)
diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
new file mode 100644
index 000000000..0dad6da79
--- /dev/null
+++ b/runsc/cmd/boot.go
@@ -0,0 +1,161 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "os"
+ "runtime"
+ "runtime/debug"
+ "strings"
+ "syscall"
+
+ "context"
+ "flag"
+ "github.com/google/subcommands"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+// Boot implements subcommands.Command for the "boot" command which starts a
+// new sandbox. It should not be called directly.
+type Boot struct {
+ // bundleDir is the path to the bundle directory. It is set by the
+ // required --bundle flag.
+ bundleDir string
+
+ // controllerFD is the file descriptor of a stream socket for the
+ // control server that is donated to this process. It is set by the
+ // required --controller-fd flag and defaults to -1.
+ controllerFD int
+
+ // ioFDs is the list of FDs used to connect to FS gofers. Each --io-fds
+ // occurrence adds one FD.
+ ioFDs intFlags
+
+ // console is set to true if the sandbox should allow terminal ioctl(2)
+ // syscalls.
+ console bool
+
+ // applyCaps determines if capabilities defined in the spec should be applied
+ // to the process. When set, Execute applies them and re-executes the
+ // binary without this flag.
+ applyCaps bool
+}
+
+// Name implements subcommands.Command.Name.
+func (*Boot) Name() string {
+ return "boot"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Boot) Synopsis() string {
+ return "launch a sandbox process (internal use only)"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Boot) Usage() string {
+ return `boot [flags]`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (b *Boot) SetFlags(f *flag.FlagSet) {
+ f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory")
+ f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process")
+ f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec")
+ f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls")
+ f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process")
+}
+
+// Execute implements subcommands.Command.Execute. It starts a sandbox in a
+// waiting state.
+//
+// args[0] must be a *boot.Config and args[1] a *syscall.WaitStatus; the
+// latter receives the application's wait status once it has exited. A usage
+// error is returned when --bundle or --controller-fd is missing or when
+// positional arguments are given. Any other failure ends the process through
+// Fatalf.
+func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if b.bundleDir == "" || b.controllerFD == -1 || f.NArg() != 0 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	// Ensure that if there is a panic, all goroutine stacks are printed.
+	debug.SetTraceback("all")
+
+	// Get the spec from the bundleDir.
+	spec, err := specutils.ReadSpec(b.bundleDir)
+	if err != nil {
+		Fatalf("error reading spec: %v", err)
+	}
+	specutils.LogSpec(spec)
+
+	// Turn any relative paths in the spec to absolute by prepending the bundleDir.
+	spec.Root.Path = absPath(b.bundleDir, spec.Root.Path)
+	// Index into the slice: ranging by value would only modify a copy of
+	// each mount and leave spec.Mounts untouched.
+	for i := range spec.Mounts {
+		if src := spec.Mounts[i].Source; src != "" {
+			spec.Mounts[i].Source = absPath(b.bundleDir, src)
+		}
+	}
+
+	conf := args[0].(*boot.Config)
+	waitStatus := args[1].(*syscall.WaitStatus)
+
+	if b.applyCaps {
+		setCapsAndCallSelf(conf, spec)
+		Fatalf("setCapsAndCallSelf must never return")
+	}
+
+	// Create the loader.
+	s, err := boot.New(spec, conf, b.controllerFD, b.ioFDs.GetArray(), b.console)
+	if err != nil {
+		Fatalf("error creating loader: %v", err)
+	}
+	defer s.Destroy()
+
+	// Wait for the start signal from runsc.
+	s.WaitForStartSignal()
+
+	// Run the application and wait for it to finish.
+	if err := s.Run(); err != nil {
+		Fatalf("error running sandbox: %v", err)
+	}
+
+	ws := s.WaitExit()
+	log.Infof("application exiting with %+v", ws)
+	*waitStatus = syscall.WaitStatus(ws.Status())
+	return subcommands.ExitSuccess
+}
+
+// setCapsAndCallSelf sets capabilities to the current thread and then execve's
+// itself again with the same arguments except '--apply-caps' to restart the
+// whole process with the desired capabilities.
+//
+// It does not return on success. Any failure, including a failed execve,
+// ends the process through Fatalf.
+func setCapsAndCallSelf(conf *boot.Config, spec *specs.Spec) {
+	// Keep thread locked while capabilities are changed.
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	if err := boot.ApplyCaps(conf, spec.Process.Capabilities); err != nil {
+		Fatalf("ApplyCaps, err: %v", err)
+	}
+	binPath, err := specutils.BinPath()
+	if err != nil {
+		Fatalf("%v", err)
+	}
+
+	// Remove --apply-caps arg to call myself. Only the flag itself is
+	// dropped ("-apply-caps", "--apply-caps", with or without "=value");
+	// other arguments that merely contain the text, such as a bundle path,
+	// are kept.
+	var args []string
+	for _, arg := range os.Args {
+		if name := strings.TrimLeft(arg, "-"); name != arg {
+			if i := strings.IndexByte(name, '='); i >= 0 {
+				name = name[:i]
+			}
+			if name == "apply-caps" {
+				continue
+			}
+		}
+		args = append(args, arg)
+	}
+
+	log.Infof("Execve 'boot' again, bye!")
+	log.Infof("%s %v", binPath, args)
+	// Exec only returns when it fails; report the reason instead of
+	// dropping it.
+	if err := syscall.Exec(binPath, args, []string{}); err != nil {
+		Fatalf("error executing %s: %v", binPath, err)
+	}
+}
diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go
new file mode 100644
index 000000000..d4b834213
--- /dev/null
+++ b/runsc/cmd/cmd.go
@@ -0,0 +1,77 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package cmd holds implementations of the runsc commands.
+package cmd
+
+import (
+ "fmt"
+ "os"
+ "strconv"
+
+ "flag"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+)
+
+// Fatalf reports a fatal error and terminates the process with exit code 128.
+// The message is written both to the log, as a warning prefixed with
+// "FATAL ERROR: ", and to stderr.
+func Fatalf(s string, args ...interface{}) {
+	logFormat := "FATAL ERROR: " + s
+	stderrFormat := s + "\n"
+
+	// If runsc is being invoked by docker or cri-o, then we might not have
+	// access to stderr, so we log a serious-looking warning in addition to
+	// writing to stderr.
+	log.Warningf(logFormat, args...)
+	fmt.Fprintf(os.Stderr, stderrFormat, args...)
+
+	// Return an error that is unlikely to be used by the application.
+	os.Exit(128)
+}
+
+// commandLineFlags collects every top-level command line flag that has been
+// set, each rendered as "--name=value".
+func commandLineFlags() []string {
+	var collected []string
+	appendFlag := func(fl *flag.Flag) {
+		collected = append(collected, "--"+fl.Name+"="+fl.Value.String())
+	}
+	flag.CommandLine.Visit(appendFlag)
+	return collected
+}
+
+// intFlags can be used with int flags that appear multiple times. Each
+// occurrence of the flag appends one value to the slice.
+type intFlags []int
+
+// String implements flag.Value. It returns the default formatting of the
+// underlying int slice.
+func (i *intFlags) String() string {
+	return fmt.Sprint(*i)
+}
+
+// Get implements flag.Value.
+func (i *intFlags) Get() interface{} {
+ return i
+}
+
+// GetArray returns the collected FDs as a plain []int.
+func (i *intFlags) GetArray() []int {
+	fds := []int(*i)
+	return fds
+}
+
+// Set implements flag.Value. It parses s as a decimal integer and appends it
+// to the list. An error is returned when s is not an integer or when the
+// value is negative; zero is accepted.
+func (i *intFlags) Set(s string) error {
+	fd, err := strconv.Atoi(s)
+	if err != nil {
+		return fmt.Errorf("invalid flag value: %v", err)
+	}
+	if fd < 0 {
+		return fmt.Errorf("flag value must be greater than or equal to 0: %d", fd)
+	}
+	*i = append(*i, fd)
+	return nil
+}
diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go
new file mode 100644
index 000000000..83cb09eb0
--- /dev/null
+++ b/runsc/cmd/create.go
@@ -0,0 +1,93 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/sandbox"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+// Create implements subcommands.Command for the "create" command.
+type Create struct {
+ // bundleDir is the path to the bundle directory (defaults to the
+ // current working directory). It is set by the --bundle flag.
+ bundleDir string
+
+ // pidFile is the filename that the sandbox pid will be written to.
+ // This file should only be created once the sandbox process is ready
+ // to use (i.e. control server has started and is listening). It is set
+ // by the --pid-file flag.
+ pidFile string
+
+ // consoleSocket is the path to an AF_UNIX socket which will receive a
+ // file descriptor referencing the master end of the console's
+ // pseudoterminal. This is ignored unless spec.Process.Terminal is
+ // true. It is set by the --console-socket flag.
+ consoleSocket string
+}
+
+// Name implements subcommands.Command.Name.
+func (*Create) Name() string {
+ return "create"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Create) Synopsis() string {
+ return "create a secure container"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Create) Usage() string {
+ return `create [flags] <container id> - create a secure container
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (c *Create) SetFlags(f *flag.FlagSet) {
+ f.StringVar(&c.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory")
+ f.StringVar(&c.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal")
+ f.StringVar(&c.pidFile, "pid-file", "", "filename that the sandbox pid will be written to")
+}
+
+// Execute implements subcommands.Command.Execute. It expects exactly one
+// positional argument, the container id, and a *boot.Config in args[0]. The
+// spec is read from the bundle directory and a sandbox is created from it;
+// failures end the process through Fatalf.
+func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	id := f.Arg(0)
+	conf := args[0].(*boot.Config)
+
+	// Fall back to the current working directory when no bundle was given.
+	bundleDir := c.bundleDir
+	if len(bundleDir) == 0 {
+		bundleDir = getwdOrDie()
+	}
+
+	spec, err := specutils.ReadSpec(bundleDir)
+	if err != nil {
+		Fatalf("error reading spec: %v", err)
+	}
+	specutils.LogSpec(spec)
+
+	// Create the sandbox process, passing additional command line
+	// arguments to the sandbox process.
+	extraArgs := commandLineFlags()
+	if _, createErr := sandbox.Create(id, spec, conf, bundleDir, c.consoleSocket, c.pidFile, extraArgs); createErr != nil {
+		Fatalf("error creating sandbox: %v", createErr)
+	}
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go
new file mode 100644
index 000000000..a497c034d
--- /dev/null
+++ b/runsc/cmd/delete.go
@@ -0,0 +1,74 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/sandbox"
+)
+
+// Delete implements subcommands.Command for the "delete" command.
+type Delete struct {
+ // force indicates that the sandbox should be terminated if running. It
+ // is set by the --force flag; without it, deleting a running sandbox is
+ // a fatal error.
+ force bool
+}
+
+// Name implements subcommands.Command.Name.
+func (*Delete) Name() string {
+ return "delete"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Delete) Synopsis() string {
+ return "delete resources held by a container"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Delete) Usage() string {
+ return `delete [flags] <container ids>`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (d *Delete) SetFlags(f *flag.FlagSet) {
+ f.BoolVar(&d.force, "force", false, "terminate sandbox if running")
+}
+
+// Execute implements subcommands.Command.Execute. Every positional argument
+// is treated as a sandbox id: the sandbox is loaded from conf.RootDir and
+// destroyed. A running sandbox is only destroyed when --force was given. Any
+// failure ends the process through Fatalf.
+func (d *Delete) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() == 0 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	conf := args[0].(*boot.Config)
+
+	for _, id := range f.Args() {
+		s, err := sandbox.Load(conf.RootDir, id)
+		if err != nil {
+			Fatalf("error loading sandbox %q: %v", id, err)
+		}
+
+		running := s.Status == sandbox.Running
+		if running && !d.force {
+			Fatalf("cannot stop running sandbox without --force flag")
+		}
+
+		if destroyErr := s.Destroy(); destroyErr != nil {
+			Fatalf("error destroying sandbox: %v", destroyErr)
+		}
+	}
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go
new file mode 100644
index 000000000..afd42c2f2
--- /dev/null
+++ b/runsc/cmd/events.go
@@ -0,0 +1,111 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "encoding/json"
+ "os"
+ "time"
+
+ "context"
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/sandbox"
+)
+
+// Events implements subcommands.Command for the "events" command.
+type Events struct {
+ // The interval between stats reporting, in seconds. It is set by the
+ // --interval flag and defaults to 5.
+ intervalSec int
+ // If true, events will print a single group of stats and exit. It is
+ // set by the --stats flag.
+ stats bool
+}
+
+// Name implements subcommands.Command.Name.
+func (*Events) Name() string {
+ return "events"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Events) Synopsis() string {
+ return "display container events such as OOM notifications, cpu, memory, and IO usage statistics"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Events) Usage() string {
+ return `<container-id>
+
+Where "<container-id>" is the name for the instance of the container.
+
+The events command displays information about the container. By default the
+information is displayed once every 5 seconds.
+
+OPTIONS:
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (evs *Events) SetFlags(f *flag.FlagSet) {
+ f.IntVar(&evs.intervalSec, "interval", 5, "set the stats collection interval, in seconds")
+ f.BoolVar(&evs.stats, "stats", false, "display the container's stats then exit")
+}
+
+// Execute implements subcommands.Command.Execute. It expects exactly one
+// positional argument, the container id, and a *boot.Config in args[0].
+//
+// The sandbox's event is fetched, marshalled to JSON and written to stdout,
+// repeatedly, sleeping intervalSec seconds between rounds. With --stats a
+// single round is performed; the command then fails if either fetching or
+// marshalling the event failed in that round.
+func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	id := f.Arg(0)
+	conf := args[0].(*boot.Config)
+
+	s, err := sandbox.Load(conf.RootDir, id)
+	if err != nil {
+		Fatalf("error loading sandbox: %v", err)
+	}
+
+	// Repeatedly get stats from the container.
+	for {
+		// failed records an error from either step of this round. A
+		// single err variable would lose the Event error as soon as
+		// Marshal assigns to it.
+		failed := false
+
+		// Get the event and print it as JSON.
+		ev, err := s.Event()
+		if err != nil {
+			log.Warningf("error getting events for sandbox: %v", err)
+			failed = true
+		}
+		b, err := json.Marshal(ev)
+		if err != nil {
+			log.Warningf("error while marshalling event %v: %v", ev, err)
+			failed = true
+		} else {
+			os.Stdout.Write(b)
+		}
+
+		// If we're only running once, break. If we're only running
+		// once and there was an error, the command failed.
+		if evs.stats {
+			if failed {
+				return subcommands.ExitFailure
+			}
+			break
+		}
+
+		time.Sleep(time.Duration(evs.intervalSec) * time.Second)
+	}
+
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
new file mode 100644
index 000000000..8379f552d
--- /dev/null
+++ b/runsc/cmd/exec.go
@@ -0,0 +1,375 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "os/exec"
+ "strconv"
+ "strings"
+ "syscall"
+ "time"
+
+ "context"
+ "flag"
+ "github.com/google/subcommands"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/control"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/urpc"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/sandbox"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+// Exec implements subcommands.Command for the "exec" command. Its fields hold
+// the values of the flags registered in SetFlags.
+type Exec struct {
+ // cwd is the value of the -cwd flag.
+ cwd string
+ // env collects the values of every -env flag.
+ env stringSlice
+ // user contains the UID and GID with which to run the new process.
+ user user
+ // extraKGIDs collects the values of every -additional-gids flag.
+ extraKGIDs stringSlice
+ // caps collects the values of every -cap flag.
+ caps stringSlice
+ // detach is the value of the -detach flag.
+ detach bool
+ // processPath is the value of the -process flag (path to a process.json).
+ processPath string
+ // pidFile is the value of the -pid-file flag.
+ pidFile string
+}
+
+// Name implements subcommands.Command.Name.
+func (*Exec) Name() string {
+ return "exec"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Exec) Synopsis() string {
+ return "execute new process inside the container"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Exec) Usage() string {
+ return `exec [command options] <container-id> <command> [command options] || --process process.json <container-id>
+
+
+Where "<container-id>" is the name for the instance of the container and
+"<command>" is the command to be executed in the container.
+"<command>" can't be empty unless a "-process" flag provided.
+
+EXAMPLE:
+If the container is configured to run /bin/ps the following will
+output a list of processes running in the container:
+
+ # runc exec <container-id> ps
+
+OPTIONS:
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. It registers every flag
+// of the "exec" command on f, binding each one to the matching field of ex.
+func (ex *Exec) SetFlags(f *flag.FlagSet) {
+	// Plain string and boolean flags.
+	f.StringVar(&ex.cwd, "cwd", "", "current working directory")
+	f.StringVar(&ex.processPath, "process", "", "path to the process.json")
+	f.StringVar(&ex.pidFile, "pid-file", "", "filename that the sandbox pid will be written to")
+	f.BoolVar(&ex.detach, "detach", false, "detach from the container's process")
+
+	// Flags backed by custom flag.Value implementations.
+	f.Var(&ex.user, "user", "UID (format: <uid>[:<gid>])")
+	f.Var(&ex.env, "env", "set environment variables (e.g. '-env PATH=/bin -env TERM=xterm')")
+	f.Var(&ex.extraKGIDs, "additional-gids", "additional gids")
+	f.Var(&ex.caps, "cap", "add a capability to the bounding set for the process")
+}
+
+// Execute implements subcommands.Command.Execute. It starts a process in an
+// already created sandbox.
+//
+// args[0] must be the *boot.Config and args[1] a *syscall.WaitStatus that
+// receives the wait status of the executed process (0 in detach mode).
+// Failures are reported through Fatalf.
+func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	e, id, err := ex.parseArgs(f)
+	if err != nil {
+		Fatalf("error parsing process spec: %v", err)
+	}
+	// Argv[0] is used below to locate the executable. Reject an empty command
+	// (possible when a process file carries no args) instead of panicking on
+	// the index.
+	if len(e.Argv) == 0 {
+		Fatalf("error parsing process spec: no command specified")
+	}
+	e.Detach = ex.detach
+	conf := args[0].(*boot.Config)
+	waitStatus := args[1].(*syscall.WaitStatus)
+
+	s, err := sandbox.Load(conf.RootDir, id)
+	if err != nil {
+		Fatalf("error loading sandbox: %v", err)
+	}
+
+	if e.WorkingDirectory == "" {
+		e.WorkingDirectory = s.Spec.Process.Cwd
+	}
+
+	if e.Envv == nil {
+		e.Envv, err = resolveEnvs(s.Spec.Process.Env, ex.env)
+		if err != nil {
+			Fatalf("error getting environment variables: %v", err)
+		}
+	}
+
+	// containerd expects an actual process to represent the container being
+	// executed. If detach was specified, starts a child in non-detach mode,
+	// write the child's PID to the pid file. So when the container returns, the
+	// child process will also return and signal containerd.
+	if e.Detach {
+		binPath, err := specutils.BinPath()
+		if err != nil {
+			Fatalf("error getting bin path: %v", err)
+		}
+
+		// Re-run with the same arguments minus the detach flag itself. Only
+		// the flag spellings are matched: a substring match would also drop
+		// unrelated arguments that merely contain "detach", such as a pid
+		// file path or a container ID.
+		isDetachFlag := func(a string) bool {
+			if !strings.HasPrefix(a, "-") {
+				return false
+			}
+			name := strings.TrimPrefix(strings.TrimPrefix(a, "-"), "-")
+			return name == "detach" || strings.HasPrefix(name, "detach=")
+		}
+		var childArgs []string
+		for _, a := range os.Args[1:] {
+			if !isDetachFlag(a) {
+				childArgs = append(childArgs, a)
+			}
+		}
+		cmd := exec.Command(binPath, childArgs...)
+		cmd.Stdin = os.Stdin
+		cmd.Stdout = os.Stdout
+		cmd.Stderr = os.Stderr
+		if err := cmd.Start(); err != nil {
+			Fatalf("failure to start child exec process, err: %v", err)
+		}
+
+		log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, binPath, childArgs)
+
+		// Wait for PID file to ensure that child process has started. Otherwise,
+		// '--process' file is deleted as soon as this process returns and the child
+		// may fail to read it.
+		pidFileSeen := false
+		sleepTime := 10 * time.Millisecond
+		for start := time.Now(); time.Since(start) < 10*time.Second; {
+			_, err := os.Stat(ex.pidFile)
+			if err == nil {
+				pidFileSeen = true
+				break
+			}
+			if !os.IsNotExist(err) {
+				Fatalf("unexpected error waiting for PID file, err: %v", err)
+			}
+
+			log.Infof("Waiting for PID file to be created...")
+			time.Sleep(sleepTime)
+			// Exponential backoff, capped at one second per poll.
+			sleepTime *= 2
+			if sleepTime > 1*time.Second {
+				sleepTime = 1 * time.Second
+			}
+		}
+		if !pidFileSeen {
+			log.Warningf("timed out waiting for PID file %q to be created", ex.pidFile)
+		}
+		*waitStatus = 0
+		return subcommands.ExitSuccess
+	}
+
+	if ex.pidFile != "" {
+		if err := ioutil.WriteFile(ex.pidFile, []byte(strconv.Itoa(os.Getpid())), 0644); err != nil {
+			Fatalf("error writing pid file: %v", err)
+		}
+	}
+
+	// Get the executable path, which is a bit tricky because we have to
+	// inspect the environment PATH which is relative to the root path.
+	// If the user is overriding environment variables, PATH may have been
+	// overwritten.
+	rootPath := s.Spec.Root.Path
+	e.Filename, err = specutils.GetExecutablePath(e.Argv[0], rootPath, e.Envv)
+	if err != nil {
+		Fatalf("error getting executable path: %v", err)
+	}
+
+	ws, err := s.Execute(e)
+	if err != nil {
+		Fatalf("error executing process in sandbox: %v", err)
+	}
+	*waitStatus = ws
+	return subcommands.ExitSuccess
+}
+
+// parseArgs parses exec information from the command line or a JSON file
+// depending on whether the --process flag was used. Returns an ExecArgs and
+// the ID of the sandbox to be used.
+func (ex *Exec) parseArgs(f *flag.FlagSet) (*control.ExecArgs, string, error) {
+ if ex.processPath == "" {
+ // Requires at least a container ID and command.
+ if f.NArg() < 2 {
+ f.Usage()
+ return nil, "", fmt.Errorf("both a container-id and command are required")
+ }
+ e, err := ex.argsFromCLI(f.Args()[1:])
+ return e, f.Arg(0), err
+ }
+ // Requires only the container ID.
+ if f.NArg() != 1 {
+ f.Usage()
+ return nil, "", fmt.Errorf("a container-id is required")
+ }
+ e, err := ex.argsFromProcessFile()
+ return e, f.Arg(0), err
+}
+
+// argsFromCLI builds an ExecArgs from the command-line flags stored in ex,
+// with argv as the command to run. Stdin, stdout and stderr of the current
+// process are attached as the file payload.
+//
+// It returns an error if an additional GID is not an unsigned 32-bit decimal
+// number or if the capability list cannot be converted.
+func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) {
+	extraKGIDs := make([]auth.KGID, 0, len(ex.extraKGIDs))
+	for _, s := range ex.extraKGIDs {
+		// ParseUint rejects negative and out-of-range values that a signed
+		// parse followed by a conversion would silently wrap.
+		kgid, err := strconv.ParseUint(s, 10, 32)
+		if err != nil {
+			return nil, fmt.Errorf("error parsing GID: %s, %v", s, err)
+		}
+		extraKGIDs = append(extraKGIDs, auth.KGID(kgid))
+	}
+
+	caps, err := capabilities(ex.caps)
+	if err != nil {
+		return nil, fmt.Errorf("capabilities error: %v", err)
+	}
+
+	return &control.ExecArgs{
+		Argv:             argv,
+		WorkingDirectory: ex.cwd,
+		FilePayload:      urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}},
+		KUID:             ex.user.kuid,
+		KGID:             ex.user.kgid,
+		ExtraKGIDs:       extraKGIDs,
+		Capabilities:     caps,
+	}, nil
+}
+
+// argsFromProcessFile opens the file named by ex.processPath, decodes it as a
+// JSON specs.Process and converts the result with argsFromProcess.
+func (ex *Exec) argsFromProcessFile() (*control.ExecArgs, error) {
+	file, openErr := os.Open(ex.processPath)
+	if openErr != nil {
+		return nil, fmt.Errorf("error opening process file: %s, %v", ex.processPath, openErr)
+	}
+	defer file.Close()
+
+	var proc specs.Process
+	decoder := json.NewDecoder(file)
+	if decodeErr := decoder.Decode(&proc); decodeErr != nil {
+		return nil, fmt.Errorf("error parsing process file: %s, %v", ex.processPath, decodeErr)
+	}
+	return argsFromProcess(&proc)
+}
+
+// argsFromProcess converts a specs.Process into ExecArgs. Apart from
+// referencing the current process's stdin, stdout and stderr as the file
+// payload, it does no IO.
+func argsFromProcess(p *specs.Process) (*control.ExecArgs, error) {
+	// Translate the spec's capability lists first; nothing else can fail.
+	taskCaps, err := specutils.Capabilities(p.Capabilities)
+	if err != nil {
+		return nil, fmt.Errorf("error creating capabilities: %v", err)
+	}
+
+	// Convert the spec's additional GIDs to KGIDs, one for one.
+	gids := p.User.AdditionalGids
+	kgids := make([]auth.KGID, len(gids))
+	for i := range gids {
+		kgids[i] = auth.KGID(gids[i])
+	}
+
+	stdio := []*os.File{os.Stdin, os.Stdout, os.Stderr}
+	execArgs := &control.ExecArgs{
+		Argv:             p.Args,
+		Envv:             p.Env,
+		WorkingDirectory: p.Cwd,
+		FilePayload:      urpc.FilePayload{Files: stdio},
+		KUID:             auth.KUID(p.User.UID),
+		KGID:             auth.KGID(p.User.GID),
+		ExtraKGIDs:       kgids,
+		Capabilities:     taskCaps,
+	}
+	return execArgs, nil
+}
+
+// resolveEnvs transforms lists of environment variables into a single list of
+// environment variables. If a variable is defined multiple times, the last
+// value is used.
+func resolveEnvs(envs ...[]string) ([]string, error) {
+ // First create a map of variable names to values. This removes any
+ // duplicates.
+ envMap := make(map[string]string)
+ for _, env := range envs {
+ for _, str := range env {
+ parts := strings.SplitN(str, "=", 2)
+ if len(parts) != 2 {
+ return nil, fmt.Errorf("invalid variable: %s", str)
+ }
+ envMap[parts[0]] = parts[1]
+ }
+ }
+ // Reassemble envMap into a list of environment variables of the form
+ // NAME=VALUE.
+ env := make([]string, 0, len(envMap))
+ for k, v := range envMap {
+ env = append(env, fmt.Sprintf("%s=%s", k, v))
+ }
+ return env, nil
+}
+
+// capabilities converts a list of capability names into an
+// auth.TaskCapabilities by placing every name in each of the spec's
+// capability sets (ambient, bounding, effective, inheritable and permitted)
+// and handing the result to specutils.Capabilities. This mimics runc's
+// behavior.
+func capabilities(cs []string) (*auth.TaskCapabilities, error) {
+	var all specs.LinuxCapabilities
+	// Each set receives its own copy of the names.
+	all.Ambient = append(all.Ambient, cs...)
+	all.Bounding = append(all.Bounding, cs...)
+	all.Effective = append(all.Effective, cs...)
+	all.Inheritable = append(all.Inheritable, cs...)
+	all.Permitted = append(all.Permitted, cs...)
+	return specutils.Capabilities(&all)
+}
+
+// stringSlice allows a flag to be used multiple times, where each occurrence
+// adds a value to the flag. For example, a flag called "x" could be invoked
+// via "runsc exec -x foo -x bar", and the corresponding stringSlice would be
+// {"foo", "bar"}.
+type stringSlice []string
+
+// String implements flag.Value.String. It formats the collected values the
+// way fmt prints a string slice, e.g. "[foo bar]".
+func (ss *stringSlice) String() string {
+	return fmt.Sprint([]string(*ss))
+}
+
+// Get implements flag.Getter.Get. It returns the receiver itself.
+func (ss *stringSlice) Get() interface{} {
+	return ss
+}
+
+// Set implements flag.Value.Set. It appends s to the slice and never fails.
+func (ss *stringSlice) Set(s string) error {
+	values := append(*ss, s)
+	*ss = values
+	return nil
+}
+
+// user allows -user to convey a UID and, optionally, a GID separated by a
+// colon.
+type user struct {
+	kuid auth.KUID
+	kgid auth.KGID
+}
+
+// String implements flag.Value.String. It prints both IDs with field names.
+func (u *user) String() string {
+	return fmt.Sprintf("%+v", *u)
+}
+
+// Get implements flag.Getter.Get. It returns the receiver itself.
+func (u *user) Get() interface{} {
+	return u
+}
+
+// Set implements flag.Value.Set. It parses s as "<uid>" or "<uid>:<gid>".
+//
+// Both IDs must be unsigned decimal numbers that fit in 32 bits, so negative
+// or oversized input is rejected instead of being wrapped by the integer
+// conversion. When the GID is omitted the current kgid is kept. The receiver
+// is only modified once the whole input has been parsed successfully.
+func (u *user) Set(s string) error {
+	parts := strings.SplitN(s, ":", 2)
+	uid, err := strconv.ParseUint(parts[0], 10, 32)
+	if err != nil {
+		return fmt.Errorf("couldn't parse UID: %s", parts[0])
+	}
+	kgid := u.kgid
+	if len(parts) > 1 {
+		gid, err := strconv.ParseUint(parts[1], 10, 32)
+		if err != nil {
+			return fmt.Errorf("couldn't parse GID: %s", parts[1])
+		}
+		kgid = auth.KGID(gid)
+	}
+	u.kuid, u.kgid = auth.KUID(uid), kgid
+	return nil
+}
diff --git a/runsc/cmd/exec_test.go b/runsc/cmd/exec_test.go
new file mode 100644
index 000000000..623461e78
--- /dev/null
+++ b/runsc/cmd/exec_test.go
@@ -0,0 +1,154 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "os"
+ "testing"
+
+ "github.com/google/go-cmp/cmp"
+ "github.com/google/go-cmp/cmp/cmpopts"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/control"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/urpc"
+)
+
+// TestUser checks that user.Set accepts "<uid>" and "<uid>:<gid>" and rejects
+// empty, non-numeric and over-segmented input.
+func TestUser(t *testing.T) {
+	testCases := []struct {
+		input   string
+		want    user
+		wantErr bool
+	}{
+		{input: "0", want: user{kuid: 0, kgid: 0}},
+		{input: "7", want: user{kuid: 7, kgid: 0}},
+		{input: "49:343", want: user{kuid: 49, kgid: 343}},
+		{input: "0:2401", want: user{kuid: 0, kgid: 2401}},
+		{input: "", wantErr: true},
+		{input: "foo", wantErr: true},
+		{input: ":123", wantErr: true},
+		{input: "1:2:3", wantErr: true},
+	}
+
+	for _, tc := range testCases {
+		var u user
+		err := u.Set(tc.input)
+		switch {
+		case err != nil && tc.wantErr:
+			// We got an error and wanted one.
+		case tc.wantErr:
+			t.Errorf("user.Set(%s): got no error, but wanted one", tc.input)
+		case err != nil:
+			t.Errorf("user.Set(%s): got error %v, but wanted none", tc.input, err)
+		case u != tc.want:
+			t.Errorf("user.Set(%s): got %+v, but wanted %+v", tc.input, u, tc.want)
+		}
+	}
+}
+
+// TestCLIArgs checks that argsFromCLI turns the flag values held in an Exec,
+// plus an argv, into the expected control.ExecArgs.
+func TestCLIArgs(t *testing.T) {
+	dacOverride := auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE)
+	stdio := urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}}
+
+	testCases := []struct {
+		ex       Exec
+		argv     []string
+		expected control.ExecArgs
+	}{
+		{
+			ex: Exec{
+				cwd:         "/foo/bar",
+				user:        user{kuid: 0, kgid: 0},
+				extraKGIDs:  []string{"1", "2", "3"},
+				caps:        []string{"CAP_DAC_OVERRIDE"},
+				processPath: "",
+			},
+			argv: []string{"ls", "/"},
+			expected: control.ExecArgs{
+				Argv:             []string{"ls", "/"},
+				WorkingDirectory: "/foo/bar",
+				FilePayload:      stdio,
+				KUID:             0,
+				KGID:             0,
+				ExtraKGIDs:       []auth.KGID{1, 2, 3},
+				Capabilities: &auth.TaskCapabilities{
+					BoundingCaps:    dacOverride,
+					EffectiveCaps:   dacOverride,
+					InheritableCaps: dacOverride,
+					PermittedCaps:   dacOverride,
+				},
+			},
+		},
+	}
+
+	// os.File has unexported fields that cmp cannot compare.
+	ignoreFileInternals := cmpopts.IgnoreUnexported(os.File{})
+	for _, tc := range testCases {
+		e, err := tc.ex.argsFromCLI(tc.argv)
+		if err != nil {
+			t.Errorf("argsFromCLI(%+v): got error: %+v", tc.ex, err)
+			continue
+		}
+		if !cmp.Equal(*e, tc.expected, ignoreFileInternals) {
+			t.Errorf("argsFromCLI(%+v): got %+v, but expected %+v", tc.ex, *e, tc.expected)
+		}
+	}
+}
+
+// TestJSONArgs checks that argsFromProcess converts a specs.Process into the
+// expected control.ExecArgs.
+func TestJSONArgs(t *testing.T) {
+	dacOverride := auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE)
+	stdio := urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}}
+
+	testCases := []struct {
+		// ex is provided to make sure it is overridden by p.
+		// NOTE(review): the loop below never reads ex; confirm whether it
+		// was meant to be exercised.
+		ex       Exec
+		p        specs.Process
+		expected control.ExecArgs
+	}{
+		{
+			ex: Exec{
+				cwd:         "/baz/quux",
+				user:        user{kuid: 1, kgid: 1},
+				extraKGIDs:  []string{"4", "5", "6"},
+				caps:        []string{"CAP_SETGID"},
+				processPath: "/bin/foo",
+			},
+			p: specs.Process{
+				User: specs.User{UID: 0, GID: 0, AdditionalGids: []uint32{1, 2, 3}},
+				Args: []string{"ls", "/"},
+				Cwd:  "/foo/bar",
+				Capabilities: &specs.LinuxCapabilities{
+					Bounding:    []string{"CAP_DAC_OVERRIDE"},
+					Effective:   []string{"CAP_DAC_OVERRIDE"},
+					Inheritable: []string{"CAP_DAC_OVERRIDE"},
+					Permitted:   []string{"CAP_DAC_OVERRIDE"},
+				},
+			},
+			expected: control.ExecArgs{
+				Argv:             []string{"ls", "/"},
+				WorkingDirectory: "/foo/bar",
+				FilePayload:      stdio,
+				KUID:             0,
+				KGID:             0,
+				ExtraKGIDs:       []auth.KGID{1, 2, 3},
+				Capabilities: &auth.TaskCapabilities{
+					BoundingCaps:    dacOverride,
+					EffectiveCaps:   dacOverride,
+					InheritableCaps: dacOverride,
+					PermittedCaps:   dacOverride,
+				},
+			},
+		},
+	}
+
+	// os.File has unexported fields that cmp cannot compare.
+	ignoreFileInternals := cmpopts.IgnoreUnexported(os.File{})
+	for _, tc := range testCases {
+		e, err := argsFromProcess(&tc.p)
+		if err != nil {
+			t.Errorf("argsFromProcess(%+v): got error: %+v", tc.p, err)
+			continue
+		}
+		if !cmp.Equal(*e, tc.expected, ignoreFileInternals) {
+			t.Errorf("argsFromProcess(%+v): got %+v, but expected %+v", tc.p, *e, tc.expected)
+		}
+	}
+}
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
new file mode 100644
index 000000000..844e16dbf
--- /dev/null
+++ b/runsc/cmd/gofer.go
@@ -0,0 +1,134 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "sync"
+
+ "context"
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/p9"
+ "gvisor.googlesource.com/gvisor/pkg/unet"
+ "gvisor.googlesource.com/gvisor/runsc/fsgofer"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+// Gofer implements subcommands.Command for the "gofer" command, which starts a
+// filesystem gofer. This command should not be called directly.
+type Gofer struct {
+ // bundleDir is the value of the -bundle flag.
+ bundleDir string
+ // ioFDs is the value of the -io-fds flag: one FD for the root, followed
+ // by one per mount, in the order defined in the spec.
+ ioFDs intFlags
+}
+
+// Name implements subcommands.Command.
+func (*Gofer) Name() string {
+ return "gofer"
+}
+
+// Synopsis implements subcommands.Command. It returns a one-line description
+// of the command.
+func (*Gofer) Synopsis() string {
+	return "launch a gofer process that serves files over 9P protocol (internal use only)"
+}
+
+// Usage implements subcommands.Command.
+func (*Gofer) Usage() string {
+ return `gofer [flags]`
+}
+
+// SetFlags implements subcommands.Command. It registers -bundle and -io-fds
+// on f, binding them to the matching fields of g.
+func (g *Gofer) SetFlags(f *flag.FlagSet) {
+	f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec")
+	f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory")
+}
+
+// Execute implements subcommands.Command. It reads the spec from the bundle
+// directory, creates one attach point for the root and one for every 9P
+// mount in the spec, and serves each of them on the matching FD from -io-fds
+// until all servers exit.
+//
+// At least one FD is required, and the number of FDs must equal the number
+// of served paths (root plus 9P mounts). When -bundle is not given, the
+// current working directory is used, as the flag's help text states.
+func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if len(g.ioFDs) < 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	bundleDir := g.bundleDir
+	if bundleDir == "" {
+		bundleDir = getwdOrDie()
+	}
+
+	spec, err := specutils.ReadSpec(bundleDir)
+	if err != nil {
+		Fatalf("error reading spec: %v", err)
+	}
+	specutils.LogSpec(spec)
+
+	// Start with root mount, then add any other addition mount as needed.
+	ats := make([]p9.Attacher, 0, len(spec.Mounts)+1)
+	p := absPath(bundleDir, spec.Root.Path)
+	ats = append(ats, fsgofer.NewAttachPoint(p, fsgofer.Config{
+		ROMount: spec.Root.Readonly,
+		// Docker uses overlay2 by default for the root mount, and overlay2 does a copy-up when
+		// each file is opened as writable. Thus, we open files lazily to avoid copy-up.
+		LazyOpenForWrite: true,
+	}))
+	log.Infof("Serving %q mapped to %q on FD %d", "/", p, g.ioFDs[0])
+
+	mountIdx := 1 // first one is the root
+	for _, m := range spec.Mounts {
+		if !specutils.Is9PMount(m) {
+			continue
+		}
+		// Every served mount needs its own FD; check before using the index.
+		if mountIdx >= len(g.ioFDs) {
+			Fatalf("No FD found for mount. Did you forget --io-fds? mount: %d, %v", mountIdx, m)
+		}
+
+		p = absPath(bundleDir, m.Source)
+		ats = append(ats, fsgofer.NewAttachPoint(p, fsgofer.Config{
+			ROMount:          isReadonlyMount(m.Options),
+			LazyOpenForWrite: false,
+		}))
+		log.Infof("Serving %q mapped to %q on FD %d", m.Destination, p, g.ioFDs[mountIdx])
+		mountIdx++
+	}
+	if mountIdx != len(g.ioFDs) {
+		Fatalf("Too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs))
+	}
+
+	runServers(ats, g.ioFDs)
+	return subcommands.ExitSuccess
+}
+
+// runServers starts one 9P server per FD in ioFDs, serving ats[i] on
+// ioFDs[i], and blocks until every server has exited. ats and ioFDs must
+// have the same length. A socket or server error is reported through Fatalf.
+func runServers(ats []p9.Attacher, ioFDs []int) {
+	// Each FD is paired with the attach point at the same index; refuse a
+	// mismatch up front rather than indexing out of range in the loop.
+	if len(ats) != len(ioFDs) {
+		Fatalf("number of attach points (%d) does not match number of FDs (%d)", len(ats), len(ioFDs))
+	}
+
+	// Run the loops and wait for all to exit.
+	var wg sync.WaitGroup
+	for i, ioFD := range ioFDs {
+		wg.Add(1)
+		go func(ioFD int, at p9.Attacher) {
+			defer wg.Done()
+			socket, err := unet.NewSocket(ioFD)
+			if err != nil {
+				Fatalf("err creating server on FD %d: %v", ioFD, err)
+			}
+			s := p9.NewServer(at)
+			if err := s.Handle(socket); err != nil {
+				Fatalf("P9 server returned error. Gofer is shutting down. FD: %d, err: %v", ioFD, err)
+			}
+		}(ioFD, ats[i])
+	}
+	wg.Wait()
+	log.Infof("All 9P servers exited.")
+}
+
+// isReadonlyMount reports whether the mount options contain the exact option
+// "ro".
+func isReadonlyMount(opts []string) bool {
+	readonly := false
+	for i := range opts {
+		if opts[i] == "ro" {
+			readonly = true
+			break
+		}
+	}
+	return readonly
+}
diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go
new file mode 100644
index 000000000..f89e0077e
--- /dev/null
+++ b/runsc/cmd/kill.go
@@ -0,0 +1,142 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "fmt"
+ "strconv"
+ "strings"
+ "syscall"
+
+ "context"
+ "flag"
+ "github.com/google/subcommands"
+ "golang.org/x/sys/unix"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/sandbox"
+)
+
+// Kill implements subcommands.Command for the "kill" command. It carries no
+// state of its own.
+type Kill struct{}
+
+// Name implements subcommands.Command.Name.
+func (*Kill) Name() string {
+ return "kill"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Kill) Synopsis() string {
+ return "sends a signal to the sandbox"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Kill) Usage() string {
+ return `kill <container id> [signal]`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (*Kill) SetFlags(f *flag.FlagSet) {
+ // TODO: Implement this flag. It is defined here just to
+ // prevent runsc from crashing if it is passed.
+ var all bool
+ f.BoolVar(&all, "all", false, "send the specified signal to all processes inside the container")
+}
+
+// Execute implements subcommands.Command.Execute. It loads the sandbox named
+// by the first positional argument and sends it the signal named by the
+// optional second positional argument, defaulting to TERM.
+//
+// args[0] must be the *boot.Config for this invocation. The signal may be a
+// number or a name, as accepted by parseSignal.
+func (*Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() == 0 || f.NArg() > 2 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	id := f.Arg(0)
+	conf := args[0].(*boot.Config)
+
+	s, err := sandbox.Load(conf.RootDir, id)
+	if err != nil {
+		Fatalf("error loading sandbox: %v", err)
+	}
+
+	// The OCI command-line spec says that the signal should be specified
+	// via a flag, but runc (and things that call runc) pass it as an
+	// argument. It is the second positional argument (index 1); Arg returns
+	// "" when it is absent.
+	signal := f.Arg(1)
+	if signal == "" {
+		signal = "TERM"
+	}
+
+	sig, err := parseSignal(signal)
+	if err != nil {
+		Fatalf("%v", err)
+	}
+	if err := s.Signal(sig); err != nil {
+		Fatalf("%v", err)
+	}
+	return subcommands.ExitSuccess
+}
+
+// parseSignal converts s into a signal. s may be a decimal number, accepted
+// only if it equals one of the values in signalMap, or a signal name in any
+// letter case with or without the "SIG" prefix. Anything else yields -1 and
+// an "unknown signal" error.
+func parseSignal(s string) (syscall.Signal, error) {
+	if n, err := strconv.Atoi(s); err == nil {
+		// Numeric form: accept it only when the number is a known signal.
+		candidate := syscall.Signal(n)
+		for _, known := range signalMap {
+			if known == candidate {
+				return candidate, nil
+			}
+		}
+	} else {
+		// Name form: normalize to the upper-case, prefix-less map key.
+		key := strings.TrimPrefix(strings.ToUpper(s), "SIG")
+		if sig, found := signalMap[key]; found {
+			return sig, nil
+		}
+	}
+	return -1, fmt.Errorf("unknown signal %q", s)
+}
+
+// signalMap maps upper-case signal names, without the "SIG" prefix, to their
+// signal values. parseSignal uses it both to look up names and to validate
+// numeric signals.
+var signalMap = map[string]syscall.Signal{
+ "ABRT": unix.SIGABRT,
+ "ALRM": unix.SIGALRM,
+ "BUS": unix.SIGBUS,
+ "CHLD": unix.SIGCHLD,
+ "CLD": unix.SIGCLD,
+ "CONT": unix.SIGCONT,
+ "FPE": unix.SIGFPE,
+ "HUP": unix.SIGHUP,
+ "ILL": unix.SIGILL,
+ "INT": unix.SIGINT,
+ "IO": unix.SIGIO,
+ "IOT": unix.SIGIOT,
+ "KILL": unix.SIGKILL,
+ "PIPE": unix.SIGPIPE,
+ "POLL": unix.SIGPOLL,
+ "PROF": unix.SIGPROF,
+ "PWR": unix.SIGPWR,
+ "QUIT": unix.SIGQUIT,
+ "SEGV": unix.SIGSEGV,
+ "STKFLT": unix.SIGSTKFLT,
+ "STOP": unix.SIGSTOP,
+ "SYS": unix.SIGSYS,
+ "TERM": unix.SIGTERM,
+ "TRAP": unix.SIGTRAP,
+ "TSTP": unix.SIGTSTP,
+ "TTIN": unix.SIGTTIN,
+ "TTOU": unix.SIGTTOU,
+ "URG": unix.SIGURG,
+ "USR1": unix.SIGUSR1,
+ "USR2": unix.SIGUSR2,
+ "VTALRM": unix.SIGVTALRM,
+ "WINCH": unix.SIGWINCH,
+ "XCPU": unix.SIGXCPU,
+ "XFSZ": unix.SIGXFSZ,
+}
diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go
new file mode 100644
index 000000000..bf7cb41bb
--- /dev/null
+++ b/runsc/cmd/list.go
@@ -0,0 +1,117 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "encoding/json"
+ "fmt"
+ "os"
+ "text/tabwriter"
+ "time"
+
+ "context"
+ "flag"
+ "github.com/google/subcommands"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/sandbox"
+)
+
+// List implements subcommands.Command for the "list" command.
+type List struct {
+ // quiet is the value of the -quiet flag: only list container ids.
+ quiet bool
+ // format is the value of the -format flag: "text" or "json".
+ format string
+}
+
+// Name implements subcommands.command.name.
+func (*List) Name() string {
+ return "list"
+}
+
+// Synopsis implements subcommands.Command.Synopsis. It returns a one-line
+// description of the command.
+func (*List) Synopsis() string {
+	return "list containers started by runsc with the given root"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*List) Usage() string {
+ return `list [flags]`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (l *List) SetFlags(f *flag.FlagSet) {
+ f.BoolVar(&l.quiet, "quiet", false, "only list container ids")
+ f.StringVar(&l.format, "format", "text", "output format: 'text' (default) or 'json'")
+}
+
+// Execute implements subcommands.Command.Execute. It lists the sandboxes
+// found under the configured root directory: only their IDs with -quiet,
+// otherwise a table (-format=text) or a JSON array of states (-format=json).
+// It takes no positional arguments; args[0] must be the *boot.Config.
+func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 0 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	rootDir := args[0].(*boot.Config).RootDir
+	ids, err := sandbox.List(rootDir)
+	if err != nil {
+		Fatalf("%v", err)
+	}
+
+	// Quiet mode needs nothing but the IDs.
+	if l.quiet {
+		for i := range ids {
+			fmt.Println(ids[i])
+		}
+		return subcommands.ExitSuccess
+	}
+
+	// Collect the sandboxes.
+	var sandboxes []*sandbox.Sandbox
+	for _, id := range ids {
+		sb, loadErr := sandbox.Load(rootDir, id)
+		if loadErr != nil {
+			Fatalf("error loading sandbox %q: %v", id, loadErr)
+		}
+		sandboxes = append(sandboxes, sb)
+	}
+
+	switch l.format {
+	case "json":
+		// Print just the states.
+		var states []specs.State
+		for _, sb := range sandboxes {
+			states = append(states, sb.State())
+		}
+		encoder := json.NewEncoder(os.Stdout)
+		if encodeErr := encoder.Encode(states); encodeErr != nil {
+			Fatalf("error marshaling sandbox state: %v", encodeErr)
+		}
+	case "text":
+		// Print a nice table.
+		table := tabwriter.NewWriter(os.Stdout, 12, 1, 3, ' ', 0)
+		fmt.Fprint(table, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\tOWNER\n")
+		for _, sb := range sandboxes {
+			created := sb.CreatedAt.Format(time.RFC3339Nano)
+			fmt.Fprintf(table, "%s\t%d\t%s\t%s\t%s\t%s\n",
+				sb.ID, sb.Pid, sb.Status, sb.BundleDir, created, sb.Owner)
+		}
+		table.Flush()
+	default:
+		Fatalf("unknown list format %q", l.format)
+	}
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/path.go b/runsc/cmd/path.go
new file mode 100644
index 000000000..4bb1dbb4f
--- /dev/null
+++ b/runsc/cmd/path.go
@@ -0,0 +1,38 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "os"
+ "path/filepath"
+)
+
+// absPath returns rel unchanged when it is already absolute; otherwise it
+// returns rel joined onto base.
+func absPath(base, rel string) string {
+	if !filepath.IsAbs(rel) {
+		return filepath.Join(base, rel)
+	}
+	return rel
+}
+
+// getwdOrDie returns the current working directory, reporting a failure to
+// obtain it through Fatalf.
+func getwdOrDie() string {
+	cwd, wdErr := os.Getwd()
+	if wdErr != nil {
+		Fatalf("error getting current working directory: %v", wdErr)
+	}
+	return cwd
+}
diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go
new file mode 100644
index 000000000..a667ec04c
--- /dev/null
+++ b/runsc/cmd/ps.go
@@ -0,0 +1,86 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "fmt"
+
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/control"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/sandbox"
+)
+
+// PS implements subcommands.Command for the "ps" command.
+type PS struct {
+ // format is the value of the -format flag: "table" or "json".
+ format string
+}
+
+// Name implements subcommands.Command.Name.
+func (*PS) Name() string {
+ return "ps"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*PS) Synopsis() string {
+ return "ps displays the processes running inside a container"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*PS) Usage() string {
+ return "<container-id> [ps options]"
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (ps *PS) SetFlags(f *flag.FlagSet) {
+ f.StringVar(&ps.format, "format", "table", "output format. Select one of: table or json (default: table)")
+}
+
+// Execute implements subcommands.Command.Execute. It loads the sandbox named
+// by the single positional argument, fetches its process list and prints it
+// to stdout as a table or as JSON, according to -format.
+//
+// args[0] must be the *boot.Config for this invocation. Failures, including
+// an unsupported format, are reported through Fatalf.
+func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	id := f.Arg(0)
+	conf := args[0].(*boot.Config)
+
+	s, err := sandbox.Load(conf.RootDir, id)
+	if err != nil {
+		Fatalf("error loading sandbox: %v", err)
+	}
+	pList, err := s.Processes()
+	if err != nil {
+		Fatalf("error getting processes for sandbox: %v", err)
+	}
+
+	switch ps.format {
+	case "table":
+		fmt.Println(control.ProcessListToTable(pList))
+	case "json":
+		o, err := control.PrintPIDsJSON(pList)
+		if err != nil {
+			Fatalf("error generating JSON: %v", err)
+		}
+		fmt.Println(o)
+	default:
+		Fatalf("Unsupported format: %s", ps.format)
+	}
+
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go
new file mode 100644
index 000000000..a61a6c73e
--- /dev/null
+++ b/runsc/cmd/run.go
@@ -0,0 +1,82 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "syscall"
+
+ "context"
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/sandbox"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+// Run implements subcommands.Command for the "run" command.
+type Run struct {
+ // Run flags are a super-set of those for Create. Embedding Create lets
+ // SetFlags delegate to it and gives Execute access to the parsed
+ // bundleDir, consoleSocket and pidFile values.
+ Create
+}
+
+// Name implements subcommands.Command.Name. It reports the name under which
+// this command is registered.
+func (*Run) Name() string {
+	const name = "run"
+	return name
+}
+
+// Synopsis implements subcommands.Command.Synopsis. It returns the one-line
+// description of the command.
+func (*Run) Synopsis() string {
+	const synopsis = "create and run a secure container"
+	return synopsis
+}
+
+// Usage implements subcommands.Command.Usage. The returned text ends with a
+// newline.
+func (*Run) Usage() string {
+	return "run [flags] <container id> - create and run a secure container.\n"
+}
+
+// SetFlags implements subcommands.Command.SetFlags. "run" takes the same
+// flags as "create", so registration is delegated to the embedded Create.
+func (r *Run) SetFlags(f *flag.FlagSet) {
+	create := &r.Create
+	create.SetFlags(f)
+}
+
+// Execute implements subcommands.Command.Execute. It reads the OCI spec from
+// the bundle directory (the current directory when none was given), runs the
+// sandbox and stores the resulting wait status in the caller-provided
+// location.
+//
+// args[0] must be a *boot.Config and args[1] a *syscall.WaitStatus.
+func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	var (
+		id         = f.Arg(0)
+		conf       = args[0].(*boot.Config)
+		waitStatus = args[1].(*syscall.WaitStatus)
+		bundleDir  = r.bundleDir
+	)
+	if bundleDir == "" {
+		// No bundle given: fall back to the working directory.
+		bundleDir = getwdOrDie()
+	}
+
+	spec, err := specutils.ReadSpec(bundleDir)
+	if err != nil {
+		Fatalf("error reading spec: %v", err)
+	}
+
+	ws, err := sandbox.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, commandLineFlags())
+	if err != nil {
+		Fatalf("error running sandbox: %v", err)
+	}
+
+	// Hand the sandbox's wait status back to the caller.
+	*waitStatus = ws
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go
new file mode 100644
index 000000000..a8e132497
--- /dev/null
+++ b/runsc/cmd/start.go
@@ -0,0 +1,64 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/sandbox"
+)
+
+// Start implements subcommands.Command for the "start" command. It carries
+// no state: the command registers no flags of its own.
+type Start struct{}
+
+// Name implements subcommands.Command.Name. It reports the name under which
+// this command is registered.
+func (*Start) Name() string {
+	const name = "start"
+	return name
+}
+
+// Synopsis implements subcommands.Command.Synopsis. It returns the one-line
+// description of the command.
+func (*Start) Synopsis() string {
+	const synopsis = "start a secure container"
+	return synopsis
+}
+
+// Usage implements subcommands.Command.Usage. It returns the argument
+// summary for the command.
+func (*Start) Usage() string {
+	const usage = "start <container id> - start a secure container."
+	return usage
+}
+
+// SetFlags implements subcommands.Command.SetFlags. The "start" command
+// defines no flags, so there is nothing to register.
+func (*Start) SetFlags(*flag.FlagSet) {
+}
+
+// Execute implements subcommands.Command.Execute. It loads the sandbox named
+// by the single positional argument from conf.RootDir and starts it.
+//
+// args[0] must be the *boot.Config for this invocation.
+func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	var (
+		id   = f.Arg(0)
+		conf = args[0].(*boot.Config)
+	)
+
+	s, err := sandbox.Load(conf.RootDir, id)
+	if err != nil {
+		Fatalf("error loading sandbox: %v", err)
+	}
+
+	err = s.Start(conf)
+	if err != nil {
+		Fatalf("error starting sandbox: %v", err)
+	}
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go
new file mode 100644
index 000000000..0b47f290a
--- /dev/null
+++ b/runsc/cmd/state.go
@@ -0,0 +1,73 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "encoding/json"
+ "os"
+
+ "context"
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/sandbox"
+)
+
+// State implements subcommands.Command for the "state" command. It carries
+// no state: the command registers no flags of its own.
+type State struct{}
+
+// Name implements subcommands.Command.Name. It reports the name under which
+// this command is registered.
+func (*State) Name() string {
+	const name = "state"
+	return name
+}
+
+// Synopsis implements subcommands.Command.Synopsis. It returns the one-line
+// description of the command.
+func (*State) Synopsis() string {
+	const synopsis = "get the state of a sandbox"
+	return synopsis
+}
+
+// Usage implements subcommands.Command.Usage. It returns the argument
+// summary for the command.
+func (*State) Usage() string {
+	const usage = "state [flags] <container id> - get the state of a sandbox"
+	return usage
+}
+
+// SetFlags implements subcommands.Command.SetFlags. The "state" command
+// defines no flags, so there is nothing to register.
+func (*State) SetFlags(*flag.FlagSet) {
+}
+
+// Execute implements subcommands.Command.Execute. It loads the sandbox named
+// by the single positional argument from conf.RootDir and writes its state,
+// JSON-encoded and indented, to stdout.
+//
+// args[0] must be the *boot.Config for this invocation.
+func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	id := f.Arg(0)
+	conf := args[0].(*boot.Config)
+
+	s, err := sandbox.Load(conf.RootDir, id)
+	if err != nil {
+		Fatalf("error loading sandbox: %v", err)
+	}
+	log.Debugf("Returning state %+v", s)
+
+	// Write json-encoded state directly to stdout.
+	b, err := json.MarshalIndent(s.State(), "", " ")
+	if err != nil {
+		Fatalf("error marshaling sandbox state: %v", err)
+	}
+	// The caller consumes this output, so a short or failed write must not
+	// be reported as success.
+	if _, err := os.Stdout.Write(b); err != nil {
+		Fatalf("error writing sandbox state: %v", err)
+	}
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD
new file mode 100644
index 000000000..24e172f48
--- /dev/null
+++ b/runsc/fsgofer/BUILD
@@ -0,0 +1,33 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+# Library target for the fsgofer package. It is visible only to targets
+# under //runsc.
+go_library(
+ name = "fsgofer",
+ srcs = [
+ "fsgofer.go",
+ "fsgofer_unsafe.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/runsc/fsgofer",
+ visibility = [
+ "//runsc:__subpackages__",
+ ],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/fd",
+ "//pkg/log",
+ "//pkg/p9",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
+
+# Tests for :fsgofer. The library is embedded, so the test file is built as
+# part of the same Go package.
+go_test(
+ name = "fsgofer_test",
+ size = "small",
+ srcs = ["fsgofer_test.go"],
+ embed = [":fsgofer"],
+ deps = [
+ "//pkg/log",
+ "//pkg/p9",
+ ],
+)
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
new file mode 100644
index 000000000..5ddc75a9d
--- /dev/null
+++ b/runsc/fsgofer/fsgofer.go
@@ -0,0 +1,937 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fsgofer implements p9.File giving access to local files using
+// a simple mapping from a path prefix that is added to the path requested
+// by the sandbox. Ex:
+//
+// prefix: "/docker/imgs/alpine"
+// app path: /bin/ls => /docker/imgs/alpine/bin/ls
+package fsgofer
+
+import (
+ "fmt"
+ "io"
+ "math"
+ "os"
+ "path"
+ "path/filepath"
+ "strings"
+ "sync"
+ "syscall"
+
+ "golang.org/x/sys/unix"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/fd"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/p9"
+)
+
+const (
+ // invalidMode is set to a value that doesn't match any other valid
+ // modes to ensure an unopened/closed file fails all mode checks.
+ invalidMode = p9.OpenFlags(math.MaxUint32)
+
+ // openFlags is OR'ed into the flags of the open calls made in this
+ // file. O_NOFOLLOW makes opening a symlink fail rather than follow it
+ // and O_CLOEXEC marks the descriptor close-on-exec.
+ openFlags = syscall.O_NOFOLLOW | syscall.O_CLOEXEC
+)
+
+// fileType identifies the kind of host file wrapped by a localFile.
+type fileType int
+
+const (
+	regular fileType = iota
+	directory
+	symlink
+)
+
+// String implements fmt.Stringer. Values outside the declared constants
+// are rendered as "unknown".
+func (f fileType) String() string {
+	names := [...]string{
+		regular:   "regular",
+		directory: "directory",
+		symlink:   "symlink",
+	}
+	if f < 0 || int(f) >= len(names) {
+		return "unknown"
+	}
+	return names[f]
+}
+
+// Config sets configuration options for each attach point. The same Config
+// is copied into every localFile created under the attach point.
+type Config struct {
+ // ROMount is set to true if this is a readonly mount. Mutating
+ // operations on files of a readonly mount fail with EBADF.
+ ROMount bool
+
+ // LazyOpenForWrite causes the underlying file to be opened in RDONLY
+ // mode initially and to be reopened when write access is requested.
+ // This works around the behavior of 'overlay2', which eagerly copies
+ // the entire file up when it's opened in write mode, even if the file
+ // is never actually written to.
+ LazyOpenForWrite bool
+}
+
+// attachPoint implements p9.Attacher. Paths requested through Attach are
+// resolved relative to 'prefix' on the host.
+type attachPoint struct {
+ // prefix is the host path prepended to every attached path.
+ prefix string
+ // conf is handed to the files created from this attach point.
+ conf Config
+}
+
+// NewAttachPoint returns a p9.Attacher that serves local files located
+// under 'prefix', configured with 'c'.
+func NewAttachPoint(prefix string, c Config) p9.Attacher {
+	a := new(attachPoint)
+	a.prefix = prefix
+	a.conf = c
+	return a
+}
+
+// Attach implements p9.Attacher. It opens the host file found by joining the
+// attach point's prefix with appPath and wraps it in a localFile.
+//
+// appPath must be absolute. The file is opened read-only with openFlags, so
+// a symlink as the final path component is not followed. When an error is
+// returned, the host file is closed and the returned p9.File is nil.
+func (a *attachPoint) Attach(appPath string) (p9.File, error) {
+	if !path.IsAbs(appPath) {
+		return nil, fmt.Errorf("invalid path %q", appPath)
+	}
+
+	root := filepath.Join(a.prefix, appPath)
+	f, err := os.OpenFile(root, openFlags|syscall.O_RDONLY, 0)
+	if err != nil {
+		return nil, fmt.Errorf("unable to open file %q, err: %v", root, err)
+	}
+	stat, err := stat(int(f.Fd()))
+	if err != nil {
+		f.Close()
+		return nil, fmt.Errorf("failed to stat file %q, err: %v", root, err)
+	}
+	lf, err := newLocalFile(a.conf, f, root, stat)
+	if err != nil {
+		// Unsupported file type: don't leak the descriptor, and return an
+		// untyped nil so callers comparing the p9.File against nil see nil.
+		f.Close()
+		return nil, err
+	}
+	return lf, nil
+}
+
+// makeQID builds a p9.QID from a host stat result: the QID type is derived
+// from the file mode and the QID path is the inode number.
+func makeQID(stat syscall.Stat_t) p9.QID {
+	var qid p9.QID
+	qid.Type = p9.FileMode(stat.Mode).QIDType()
+	qid.Path = stat.Ino
+	return qid
+}
+
+// isNameValid reports whether name is usable as a single path component:
+// it must not be empty, ".", "..", or contain a '/'. Rejected names are
+// logged as a warning.
+func isNameValid(name string) bool {
+	reserved := name == "" || name == "." || name == ".."
+	if reserved || strings.Contains(name, "/") {
+		log.Warningf("Invalid name: %s", name)
+		return false
+	}
+	return true
+}
+
+// localFile implements p9.File wrapping a local file. The underlying file
+// is opened during Walk() and stored in 'controlFile' to be used with other
+// operations. The mode in which the file is opened varies depending on the
+// configuration (see below). 'controlFile' is dup'ed when Walk(nil) is called
+// to clone the file.
+//
+// 'openedFile' is assigned when Open() is called. If requested open mode is
+// a subset of controlFile's mode, it's possible to use the same file. If mode
+// is not a subset, then another file is opened. Consequently, 'openedFile'
+// could have a mode wider than requested and must be verified before read/write
+// operations. Before the file is opened and after it's closed, 'mode' is set to
+// an invalid value to prevent an unopened file from being used.
+//
+// localFile has 2 modes of operation based on the configuration:
+//
+// ** conf.LazyOpenForWrite == false **
+// This is the preferred mode. 'controlFile' is opened in RW mode in Walk()
+// and used across all functions. The file is never reopened as the mode will
+// always be a super set of the requested open mode. This reduces the number of
+// syscalls required per operation and makes it resilient to renames anywhere
+// in the path to the file.
+//
+// ** conf.LazyOpenForWrite == true **
+// This mode is used for better performance with 'overlay2' storage driver.
+// overlay2 eagerly copies the entire file up when it's opened in write mode
+// which makes the mode above perform badly when several files are opened
+// for read (esp. startup). In this mode, 'controlFile' is opened as readonly
+// (or O_PATH for symlinks). Reopening the file is required if write mode
+// is requested in Open().
+type localFile struct {
+ p9.DefaultWalkGetAttr
+
+ // mu protects 'hostPath' when file is renamed.
+ mu sync.Mutex
+
+ // TODO: hostPath is not safe to use as path needs to be walked
+ // every time (and can change underneath us). Remove all usages.
+ hostPath string
+
+ // controlFile is opened when localFile is created and it's never nil.
+ controlFile *os.File
+
+ // openedFile is nil until localFile is opened. It may point to controlFile
+ // or be a new file struct. See struct comment for more details.
+ openedFile *os.File
+
+ // mode is the mode in which the file was opened. Set to invalidMode
+ // if localFile isn't opened.
+ mode p9.OpenFlags
+
+ // ft is the type of the host file (regular, directory or symlink).
+ ft fileType
+
+ // conf is the configuration of the attach point this file belongs to.
+ conf Config
+
+ // readDirMu protects against concurrent Readdir calls.
+ readDirMu sync.Mutex
+}
+
+// openAnyFile opens the entry 'name' inside the directory 'parent' using the
+// least restrictive mode that works. It returns the opened file together
+// with its host path (parent's hostPath joined with name).
+//
+// On failure the returned error is a syscall.Errno. ENOENT is returned
+// immediately without trying further modes.
+func openAnyFile(parent *localFile, name string) (*os.File, string, error) {
+	// Attempt to open file in the following mode in order:
+	//   1. RDWR: for files with rw mounts and LazyOpenForWrite disabled
+	//   2. RDONLY: for directories, ro mounts or LazyOpenForWrite enabled
+	//   3. PATH: for symlinks
+	modes := []int{syscall.O_RDWR, syscall.O_RDONLY, unix.O_PATH}
+	symlinkIdx := len(modes) - 1
+
+	startIdx := 0
+	if parent.conf.ROMount || parent.conf.LazyOpenForWrite {
+		// Skip attempt to open in RDWR based on configuration.
+		startIdx = 1
+	}
+
+	var err error
+	var fd int
+	for i := startIdx; i < len(modes); i++ {
+		flags := openFlags | modes[i]
+		fd, err = syscall.Openat(parent.controlFD(), name, flags, 0)
+		if err == nil {
+			// openat succeeded, we're done.
+			break
+		}
+		// Remember which attempt failed; 'i' may be moved forward below and
+		// must not leak into the log message.
+		attempt := i
+		switch e := extractErrno(err); e {
+		case syscall.ENOENT:
+			// File doesn't exist, no point in retrying.
+			return nil, "", e
+		case syscall.ELOOP:
+			if i < symlinkIdx {
+				// File was opened with O_NOFOLLOW, so this error can only happen when
+				// trying to open a symlink. Jump straight to flags compatible with symlink.
+				i = symlinkIdx - 1
+			}
+		}
+		// openat failed. Try again with next mode, preserving 'err' in
+		// case this was the last attempt.
+		log.Debugf("Attempt %d to open file failed, mode: %#x, path: %s/%s, err: %v", attempt, flags, parent.controlFile.Name(), name, err)
+	}
+	if err != nil {
+		// All attempts to open file have failed, return the last error.
+		log.Debugf("Failed to open file, path: %s/%s, err: %v", parent.controlFile.Name(), name, err)
+		return nil, "", extractErrno(err)
+	}
+
+	// hostPath is guarded by mu because Rename may change it.
+	parent.mu.Lock()
+	defer parent.mu.Unlock()
+	newPath := path.Join(parent.hostPath, name)
+
+	return os.NewFile(uintptr(fd), newPath), newPath, nil
+}
+
+// newLocalFile wraps an already opened host file in an unopened localFile
+// (mode is invalidMode). The file type is taken from stat.Mode; anything
+// other than a regular file, directory or symlink yields EINVAL.
+func newLocalFile(conf Config, file *os.File, path string, stat syscall.Stat_t) (*localFile, error) {
+	lf := &localFile{
+		hostPath:    path,
+		controlFile: file,
+		conf:        conf,
+		mode:        invalidMode,
+	}
+	switch stat.Mode & syscall.S_IFMT {
+	case syscall.S_IFREG:
+		lf.ft = regular
+	case syscall.S_IFDIR:
+		lf.ft = directory
+	case syscall.S_IFLNK:
+		lf.ft = symlink
+	default:
+		return nil, syscall.EINVAL
+	}
+	return lf, nil
+}
+
+// newFDMaybe builds a fd.FD for 'file' with fd.NewFromFile and switches it to
+// non-blocking mode. Any failure results in nil: it's better to have a file
+// without host FD than to fail the operation.
+func newFDMaybe(file *os.File) *fd.FD {
+	hostFD, err := fd.NewFromFile(file)
+	if err != nil {
+		return nil
+	}
+
+	// The new FD is blocking; non-blocking is required.
+	if err := syscall.SetNonblock(hostFD.FD(), true); err == nil {
+		return hostFD
+	}
+	hostFD.Close()
+	return nil
+}
+
+// stat returns the fstat(2) result for fd. On error the returned Stat_t is
+// the zero value.
+func stat(fd int) (syscall.Stat_t, error) {
+	var st syscall.Stat_t
+	err := syscall.Fstat(fd, &st)
+	if err == nil {
+		return st, nil
+	}
+	return syscall.Stat_t{}, err
+}
+
+// fchown changes the owner of the file referred to by fd itself: the path
+// is empty (AT_EMPTY_PATH) and symlinks are not followed.
+func fchown(fd int, uid p9.UID, gid p9.GID) error {
+	flags := linux.AT_EMPTY_PATH | unix.AT_SYMLINK_NOFOLLOW
+	return syscall.Fchownat(fd, "", int(uid), int(gid), flags)
+}
+
+// controlFD returns the host file descriptor of the control file.
+func (l *localFile) controlFD() int {
+	fd := l.controlFile.Fd()
+	return int(fd)
+}
+
+// openedFD returns the host file descriptor of the opened file. It panics
+// if the file has not been opened.
+func (l *localFile) openedFD() int {
+	if f := l.openedFile; f != nil {
+		return int(f.Fd())
+	}
+	panic(fmt.Sprintf("trying to use an unopened file: %q", l.controlFile.Name()))
+}
+
+// Open implements p9.File. It selects the host file used for I/O: the
+// control file when the requested mode is read-only or LazyOpenForWrite is
+// disabled, otherwise a freshly opened file in the requested mode.
+//
+// For regular files a host FD is donated to the caller when one can be
+// created (see newFDMaybe); the returned *fd.FD is nil otherwise. Errors are
+// syscall.Errno values. Open panics if the file is already opened.
+func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
+	if l.openedFile != nil {
+		panic(fmt.Sprintf("attempting to open already opened file: %q", l.controlFile.Name()))
+	}
+
+	// Check if control file can be used or if a new open must be created.
+	var newFile *os.File
+	if mode == p9.ReadOnly || !l.conf.LazyOpenForWrite {
+		log.Debugf("Open reusing control file, mode: %v, %q", mode, l.controlFile.Name())
+		newFile = l.controlFile
+	} else {
+		// Ideally reopen would call name_to_handle_at (with empty name) and open_by_handle_at
+		// to reopen the file without using 'hostPath'. However, name_to_handle_at and
+		// open_by_handle_at aren't supported by overlay2.
+		log.Debugf("Open reopening file, mode: %v, %q", mode, l.controlFile.Name())
+
+		l.mu.Lock()
+		f, err := os.OpenFile(l.hostPath, openFlags|mode.OSFlags(), 0)
+		l.mu.Unlock()
+		if err != nil {
+			return nil, p9.QID{}, 0, extractErrno(err)
+		}
+		newFile = f
+	}
+
+	stat, err := stat(int(newFile.Fd()))
+	if err != nil {
+		// Only close a file opened above. The control file stays owned by
+		// 'l' and is still needed by every other operation.
+		if newFile != l.controlFile {
+			newFile.Close()
+		}
+		return nil, p9.QID{}, 0, extractErrno(err)
+	}
+
+	var fd *fd.FD
+	if stat.Mode&syscall.S_IFMT == syscall.S_IFREG {
+		// Donate FD for regular files only.
+		fd = newFDMaybe(newFile)
+	}
+
+	// Set fields on success
+	l.openedFile = newFile
+	l.mode = mode
+	return fd, makeQID(stat), 0, nil
+}
+
+// Create implements p9.File. It creates 'name' inside this directory with
+// O_CREAT|O_EXCL, assigns it to uid/gid and returns it as an already opened
+// localFile. Readonly mounts fail with EBADF and invalid names with EINVAL.
+func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) {
+	switch {
+	case l.conf.ROMount:
+		return nil, nil, p9.QID{}, 0, syscall.EBADF
+	case !isNameValid(name):
+		return nil, nil, p9.QID{}, 0, syscall.EINVAL
+	}
+
+	// One host file serves as both 'controlFile' and 'openedFile'. The
+	// control file must be readable, so a write-only request is widened to
+	// read-write; the resulting file may have a wider mode than requested.
+	accessMode := syscall.O_RDWR
+	if mode != p9.WriteOnly {
+		accessMode = mode.OSFlags()
+	}
+	flags := openFlags | syscall.O_CREAT | syscall.O_EXCL | accessMode
+
+	newFD, err := syscall.Openat(l.controlFD(), name, flags, uint32(perm.Permissions()))
+	if err != nil {
+		return nil, nil, p9.QID{}, 0, extractErrno(err)
+	}
+	if err := fchown(newFD, uid, gid); err != nil {
+		syscall.Close(newFD)
+		return nil, nil, p9.QID{}, 0, extractErrno(err)
+	}
+	st, err := stat(newFD)
+	if err != nil {
+		syscall.Close(newFD)
+		return nil, nil, p9.QID{}, 0, extractErrno(err)
+	}
+
+	// hostPath is guarded by mu.
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	childPath := path.Join(l.hostPath, name)
+	file := os.NewFile(uintptr(newFD), childPath)
+	child := &localFile{
+		hostPath:    childPath,
+		controlFile: file,
+		openedFile:  file,
+		mode:        mode,
+		conf:        l.conf,
+	}
+	return newFDMaybe(file), child, makeQID(st), 0, nil
+}
+
+// Mkdir implements p9.File. It creates the directory 'name' inside this
+// directory, assigns it to uid/gid and returns its QID. Readonly mounts
+// fail with EBADF and invalid names with EINVAL.
+func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) {
+	switch {
+	case l.conf.ROMount:
+		return p9.QID{}, syscall.EBADF
+	case !isNameValid(name):
+		return p9.QID{}, syscall.EINVAL
+	}
+
+	err := syscall.Mkdirat(l.controlFD(), name, uint32(perm.Permissions()))
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+
+	// Open directory to change ownership and stat it.
+	dirFD, err := syscall.Openat(l.controlFD(), name, syscall.O_DIRECTORY|syscall.O_RDONLY|openFlags, 0)
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	defer syscall.Close(dirFD)
+
+	if err := fchown(dirFD, uid, gid); err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	st, err := stat(dirFD)
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	return makeQID(st), nil
+}
+
+// Walk implements p9.File.
+//
+// With no names, the file is cloned: the control file is dup'ed and a new,
+// unopened localFile of the same type is returned. Otherwise each name is
+// opened relative to the previous component and the file for the last
+// component is returned together with the QIDs of every component walked.
+//
+// Files opened for intermediate components are closed before returning, and
+// nothing opened by the walk is left open when an error is returned. Errors
+// are syscall.Errno values; an invalid name yields EINVAL.
+func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) {
+	// Duplicate current file if 'names' is empty.
+	if len(names) == 0 {
+		newFd, err := syscall.Dup(l.controlFD())
+		if err != nil {
+			return nil, nil, extractErrno(err)
+		}
+		stat, err := stat(newFd)
+		if err != nil {
+			syscall.Close(newFd)
+			return nil, nil, extractErrno(err)
+		}
+
+		l.mu.Lock()
+		defer l.mu.Unlock()
+
+		c := &localFile{
+			hostPath:    l.hostPath,
+			controlFile: os.NewFile(uintptr(newFd), l.hostPath),
+			mode:        invalidMode,
+			// The clone refers to the same host file, so it has the same
+			// type. SetAttr depends on 'ft' being accurate.
+			ft:   l.ft,
+			conf: l.conf,
+		}
+		return []p9.QID{makeQID(stat)}, c, nil
+	}
+
+	var qids []p9.QID
+	last := l
+	// closeLast releases the file opened for the previous component. 'l'
+	// itself belongs to the caller and is never closed here.
+	closeLast := func() {
+		if last != l {
+			last.Close()
+		}
+	}
+	for _, name := range names {
+		if !isNameValid(name) {
+			closeLast()
+			return nil, nil, syscall.EINVAL
+		}
+
+		f, path, err := openAnyFile(last, name)
+		if err != nil {
+			closeLast()
+			return nil, nil, extractErrno(err)
+		}
+		stat, err := stat(int(f.Fd()))
+		if err != nil {
+			f.Close()
+			closeLast()
+			return nil, nil, extractErrno(err)
+		}
+		c, err := newLocalFile(last.conf, f, path, stat)
+		if err != nil {
+			f.Close()
+			closeLast()
+			return nil, nil, extractErrno(err)
+		}
+
+		// The previous component was only needed to open this one.
+		closeLast()
+		qids = append(qids, makeQID(stat))
+		last = c
+	}
+	return qids, last, nil
+}
+
+// StatFS implements p9.File. It reports fstatfs(2) information for the
+// filesystem that holds the control file.
+func (l *localFile) StatFS() (p9.FSStat, error) {
+	var hostStat syscall.Statfs_t
+	err := syscall.Fstatfs(l.controlFD(), &hostStat)
+	if err != nil {
+		return p9.FSStat{}, extractErrno(err)
+	}
+
+	// Populate with what's available.
+	var fsStat p9.FSStat
+	fsStat.Type = uint32(hostStat.Type)
+	fsStat.BlockSize = uint32(hostStat.Bsize)
+	fsStat.Blocks = hostStat.Blocks
+	fsStat.BlocksFree = hostStat.Bfree
+	fsStat.BlocksAvailable = hostStat.Bavail
+	fsStat.Files = hostStat.Files
+	fsStat.FilesFree = hostStat.Ffree
+	fsStat.NameLength = uint32(hostStat.Namelen)
+	return fsStat, nil
+}
+
+// FSync implements p9.File. It syncs the opened file and fails with EBADF
+// when the file has not been opened.
+func (l *localFile) FSync() error {
+	f := l.openedFile
+	if f == nil {
+		return syscall.EBADF
+	}
+	err := f.Sync()
+	if err == nil {
+		return nil
+	}
+	return extractErrno(err)
+}
+
+// GetAttr implements p9.File. The requested mask is ignored: every
+// attribute available from fstat(2) on the control file is returned, and
+// the returned mask lists the attributes that were filled in.
+func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) {
+	st, err := stat(l.controlFD())
+	if err != nil {
+		return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err)
+	}
+
+	var attr p9.Attr
+	attr.Mode = p9.FileMode(st.Mode)
+	attr.UID = p9.UID(st.Uid)
+	attr.GID = p9.GID(st.Gid)
+	attr.NLink = st.Nlink
+	attr.RDev = st.Rdev
+	attr.Size = uint64(st.Size)
+	attr.BlockSize = uint64(st.Blksize)
+	attr.Blocks = uint64(st.Blocks)
+	attr.ATimeSeconds = uint64(st.Atim.Sec)
+	attr.ATimeNanoSeconds = uint64(st.Atim.Nsec)
+	attr.MTimeSeconds = uint64(st.Mtim.Sec)
+	attr.MTimeNanoSeconds = uint64(st.Mtim.Nsec)
+	attr.CTimeSeconds = uint64(st.Ctim.Sec)
+	attr.CTimeNanoSeconds = uint64(st.Ctim.Nsec)
+
+	var valid p9.AttrMask
+	valid.Mode = true
+	valid.UID = true
+	valid.GID = true
+	valid.NLink = true
+	valid.RDev = true
+	valid.Size = true
+	valid.Blocks = true
+	valid.ATime = true
+	valid.MTime = true
+	valid.CTime = true
+
+	return makeQID(st), valid, attr, nil
+}
+
+// SetAttr implements p9.File. Due to mismatch in file API, options
+// cannot be changed atomically and user may see partial changes when
+// an error happens.
+//
+// Readonly mounts fail with EBADF and masks containing unsupported
+// attributes fail with EPERM before anything is changed. Otherwise every
+// requested change is attempted and the error of the last change that
+// failed, if any, is returned as a syscall.Errno.
+func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
+	if l.conf.ROMount {
+		return syscall.EBADF
+	}
+
+	allowed := p9.SetAttrMask{
+		Permissions:        true,
+		UID:                true,
+		GID:                true,
+		Size:               true,
+		ATime:              true,
+		MTime:              true,
+		ATimeNotSystemTime: true,
+		MTimeNotSystemTime: true,
+	}
+
+	if valid.Empty() {
+		// Nothing to do.
+		return nil
+	}
+
+	// Handle all the sanity checks up front so that the client gets a
+	// consistent result that is not attribute dependent.
+	if !valid.IsSubsetOf(allowed) {
+		log.Warningf("SetAttr() failed for %q, mask: %v", l.controlFile.Name(), valid)
+		return syscall.EPERM
+	}
+
+	// Take a snapshot of hostPath under mu, since Rename may change it.
+	l.mu.Lock()
+	hostPath := l.hostPath
+	l.mu.Unlock()
+
+	fd := l.controlFD()
+	if l.conf.LazyOpenForWrite && l.ft == regular {
+		// Regular files are opened in RO mode when lazy open is set.
+		// Thus it needs to be reopened here for write.
+		f, err := os.OpenFile(hostPath, openFlags|os.O_WRONLY, 0)
+		if err != nil {
+			return extractErrno(err)
+		}
+		defer f.Close()
+		fd = int(f.Fd())
+	}
+
+	// The semantics are to either return an error if no changes were made,
+	// or no error if *all* changes were made. Well, this can be impossible
+	// if the filesystem rejects at least one of the changes, especially
+	// since some operations are not easy to undo atomically.
+	//
+	// This could be made better if SetAttr actually returned the changes
+	// it did make, so the client can at least know what has changed. So
+	// we at least attempt to make all of the changes and return a generic
+	// error if any of them fails, which at least doesn't bias any change
+	// over another.
+	var err error
+	if valid.Permissions {
+		if cerr := syscall.Fchmod(fd, uint32(attr.Permissions)); cerr != nil {
+			log.Debugf("SetAttr fchmod failed %q, err: %v", hostPath, cerr)
+			err = extractErrno(cerr)
+		}
+	}
+
+	if valid.Size {
+		if terr := syscall.Ftruncate(fd, int64(attr.Size)); terr != nil {
+			log.Debugf("SetAttr ftruncate failed %q, err: %v", hostPath, terr)
+			err = extractErrno(terr)
+		}
+	}
+
+	if valid.ATime || valid.MTime {
+		// Index 0 is atime and index 1 is mtime. Both start as "leave
+		// unchanged".
+		utimes := [2]syscall.Timespec{
+			{Sec: 0, Nsec: linux.UTIME_OMIT},
+			{Sec: 0, Nsec: linux.UTIME_OMIT},
+		}
+		if valid.ATime {
+			if valid.ATimeNotSystemTime {
+				utimes[0].Sec = int64(attr.ATimeSeconds)
+				utimes[0].Nsec = int64(attr.ATimeNanoSeconds)
+			} else {
+				utimes[0].Nsec = linux.UTIME_NOW
+			}
+		}
+		if valid.MTime {
+			if valid.MTimeNotSystemTime {
+				utimes[1].Sec = int64(attr.MTimeSeconds)
+				utimes[1].Nsec = int64(attr.MTimeNanoSeconds)
+			} else {
+				utimes[1].Nsec = linux.UTIME_NOW
+			}
+		}
+
+		if l.ft == symlink {
+			// utimensat operates differently than other syscalls. To operate on a
+			// symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty
+			// name.
+			//
+			// These locals deliberately do not reuse the name 'err': shadowing
+			// it here would drop the utimensat failure recorded below.
+			dir, derr := os.OpenFile(path.Dir(hostPath), openFlags|unix.O_PATH, 0)
+			if derr != nil {
+				return extractErrno(derr)
+			}
+			defer dir.Close()
+
+			if terr := utimensat(int(dir.Fd()), path.Base(hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); terr != nil {
+				log.Debugf("SetAttr utimens failed %q, err: %v", hostPath, terr)
+				err = extractErrno(terr)
+			}
+		} else {
+			// Directories and regular files can operate directly on the fd
+			// using empty name.
+			if terr := utimensat(fd, "", utimes, 0); terr != nil {
+				log.Debugf("SetAttr utimens failed %q, err: %v", hostPath, terr)
+				err = extractErrno(terr)
+			}
+		}
+	}
+
+	if valid.UID || valid.GID {
+		// -1 leaves the corresponding ID unchanged.
+		uid := -1
+		if valid.UID {
+			uid = int(attr.UID)
+		}
+		gid := -1
+		if valid.GID {
+			gid = int(attr.GID)
+		}
+		if oerr := syscall.Fchownat(fd, "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil {
+			log.Debugf("SetAttr fchownat failed %q, err: %v", hostPath, oerr)
+			err = extractErrno(oerr)
+		}
+	}
+
+	return err
+}
+
+// Remove implements p9.File.
+//
+// This is deprecated in favor of UnlinkAt and always fails with ENOSYS.
+func (*localFile) Remove() error {
+	const notSupported = syscall.ENOSYS
+	return notSupported
+}
+
+// Rename implements p9.File. It moves the file to 'name' inside 'directory',
+// which must be a *localFile, and records the new host path on success.
+// Readonly mounts fail with EBADF and invalid names with EINVAL.
+func (l *localFile) Rename(directory p9.File, name string) error {
+	switch {
+	case l.conf.ROMount:
+		return syscall.EBADF
+	case !isNameValid(name):
+		return syscall.EINVAL
+	}
+
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	// TODO: change to renameat(2)
+	dst := path.Join(directory.(*localFile).hostPath, name)
+	err := os.Rename(l.hostPath, dst)
+	if err != nil {
+		return extractErrno(err)
+	}
+
+	// Update path on success.
+	// TODO: this doesn't cover cases where any of the
+	// parents have been renamed.
+	l.hostPath = dst
+	return nil
+}
+
+// RenameAt implements p9.File.RenameAt.
+//
+// It is not supported and always fails with ENOSYS; code still uses the
+// [deprecated] Rename().
+func (*localFile) RenameAt(_ string, _ p9.File, _ string) error {
+	const notSupported = syscall.ENOSYS
+	return notSupported
+}
+
+// ReadAt implements p9.File. It reads from the opened file at 'offset'.
+// The file must have been opened for reading, otherwise EBADF is returned.
+// Reaching end of file is not reported as an error.
+func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) {
+	switch l.mode {
+	case p9.ReadOnly, p9.ReadWrite:
+		// Readable.
+	default:
+		return 0, syscall.EBADF
+	}
+	if l.openedFile == nil {
+		return 0, syscall.EBADF
+	}
+
+	n, err := l.openedFile.ReadAt(p, int64(offset))
+	if err == nil || err == io.EOF {
+		return n, nil
+	}
+	return n, extractErrno(err)
+}
+
+// WriteAt implements p9.File. It writes to the opened file at 'offset'.
+// The file must have been opened for writing, otherwise EBADF is returned.
+func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) {
+	switch l.mode {
+	case p9.WriteOnly, p9.ReadWrite:
+		// Writable.
+	default:
+		return 0, syscall.EBADF
+	}
+	if l.openedFile == nil {
+		return 0, syscall.EBADF
+	}
+
+	n, err := l.openedFile.WriteAt(p, int64(offset))
+	if err == nil {
+		return n, nil
+	}
+	return n, extractErrno(err)
+}
+
+// Symlink implements p9.File. It creates the symlink 'newName' pointing to
+// 'target' inside this directory, assigns it to uid/gid and returns its QID.
+// Readonly mounts fail with EBADF and invalid names with EINVAL.
+func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) {
+	switch {
+	case l.conf.ROMount:
+		return p9.QID{}, syscall.EBADF
+	case !isNameValid(newName):
+		return p9.QID{}, syscall.EINVAL
+	}
+
+	err := unix.Symlinkat(target, l.controlFD(), newName)
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+
+	// Open symlink to change ownership and stat it.
+	linkFD, err := syscall.Openat(l.controlFD(), newName, unix.O_PATH|openFlags, 0)
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	defer syscall.Close(linkFD)
+
+	if err := fchown(linkFD, uid, gid); err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	st, err := stat(linkFD)
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	return makeQID(st), nil
+}
+
+// Link implements p9.File. It creates the hard link 'newName' inside this
+// directory referring to 'target', which must be a *localFile. Readonly
+// mounts fail with EBADF and invalid names with EINVAL.
+func (l *localFile) Link(target p9.File, newName string) error {
+	switch {
+	case l.conf.ROMount:
+		return syscall.EBADF
+	case !isNameValid(newName):
+		return syscall.EINVAL
+	}
+
+	// The target is addressed through its own descriptor (empty path).
+	src := target.(*localFile)
+	err := unix.Linkat(src.controlFD(), "", l.controlFD(), newName, linux.AT_EMPTY_PATH)
+	if err == nil {
+		return nil
+	}
+	return extractErrno(err)
+}
+
+// Mknod implements p9.File.
+//
+// Not implemented: it always fails with ENOSYS.
+func (*localFile) Mknod(_ string, _ p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) {
+	var none p9.QID
+	return none, syscall.ENOSYS
+}
+
+// UnlinkAt implements p9.File. It removes 'name' from this directory,
+// passing 'flags' through to unlinkat(2). Readonly mounts fail with EBADF
+// and invalid names with EINVAL.
+func (l *localFile) UnlinkAt(name string, flags uint32) error {
+	switch {
+	case l.conf.ROMount:
+		return syscall.EBADF
+	case !isNameValid(name):
+		return syscall.EINVAL
+	}
+
+	err := unix.Unlinkat(l.controlFD(), name, int(flags))
+	if err == nil {
+		return nil
+	}
+	return extractErrno(err)
+}
+
+// Readdir implements p9.File. It returns the directory entries starting at
+// index 'offset' of the full listing; 'count' is not used. The file must
+// have been opened for reading, otherwise EBADF is returned. Entries that
+// cannot be stat'ed are left out of the result.
+func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) {
+	switch l.mode {
+	case p9.ReadOnly, p9.ReadWrite:
+		// Readable.
+	default:
+		return nil, syscall.EBADF
+	}
+	if l.openedFile == nil {
+		return nil, syscall.EBADF
+	}
+
+	// Readdirnames is a cursor over directories, so seek back to 0 to ensure it's
+	// reading all directory contents. Take a lock because this operation is stateful.
+	names, err := func() ([]string, error) {
+		l.readDirMu.Lock()
+		defer l.readDirMu.Unlock()
+
+		if _, err := l.openedFile.Seek(0, 0); err != nil {
+			return nil, err
+		}
+		return l.openedFile.Readdirnames(-1)
+	}()
+	if err != nil {
+		return nil, extractErrno(err)
+	}
+
+	// A negative start means the offset overflowed int.
+	start := int(offset)
+	if start < 0 || start >= len(names) {
+		return nil, nil
+	}
+
+	var dirents []p9.Dirent
+	for i, name := range names[start:] {
+		st, err := statAt(l.openedFD(), name)
+		if err != nil {
+			continue
+		}
+		qid := makeQID(st)
+		dirents = append(dirents, p9.Dirent{
+			QID:    qid,
+			Type:   qid.Type,
+			Name:   name,
+			Offset: uint64(start + i + 1),
+		})
+	}
+	return dirents, nil
+}
+
+// Readlink implements p9.File. It reads the symlink target through the
+// control file, doubling the buffer until the target fits. ENOMEM is
+// returned if the target does not fit below the 1 MiB limit.
+func (l *localFile) Readlink() (string, error) {
+	// Shamelessly stolen from os.Readlink (added upper bound limit to buffer).
+	const maxSize = 1024 * 1024
+	for size := 128; size < maxSize; size *= 2 {
+		buf := make([]byte, size)
+		n, err := unix.Readlinkat(l.controlFD(), "", buf)
+		if err != nil {
+			return "", extractErrno(err)
+		}
+		if n >= size {
+			// The target may have been truncated; retry with a larger buffer.
+			continue
+		}
+		return string(buf[:n]), nil
+	}
+	return "", syscall.ENOMEM
+}
+
+// Flush implements p9.File. It does nothing and always succeeds.
+func (l *localFile) Flush() error {
+	var err error
+	return err
+}
+
+// Connect implements p9.File. Connecting is not supported: it always fails
+// with ECONNREFUSED.
+func (l *localFile) Connect(p9.ConnectFlags) (*fd.FD, error) {
+	var none *fd.FD
+	return none, syscall.ECONNREFUSED
+}
+
+// Close implements p9.File. It closes the control file and, when it is a
+// distinct file, the opened file, then resets the localFile to the unopened
+// state. Both files are always closed; the first error encountered is the
+// one returned.
+func (l *localFile) Close() error {
+	err := l.controlFile.Close()
+
+	// Close only once in case opened and control files point to
+	// the same os.File struct.
+	if l.openedFile != nil && l.openedFile != l.controlFile {
+		// Don't let a successful close here hide a control file failure.
+		if cerr := l.openedFile.Close(); err == nil {
+			err = cerr
+		}
+	}
+
+	l.openedFile = nil
+	l.controlFile = nil
+	l.mode = invalidMode
+	return err
+}
+
+// extractErrno tries to determine the errno behind err. It unwraps
+// *os.PathError, *os.LinkError and *os.SyscallError, maps the os.Err*
+// sentinel values to their errno and falls back to EIO for anything else.
+// A nil err is logged and mapped to 0.
+func extractErrno(err error) syscall.Errno {
+	if err == nil {
+		// This should never happen. The likely result will be that
+		// some user gets the frustration "error: SUCCESS" message.
+		log.Warningf("extractErrno called with nil error!")
+		return 0
+	}
+
+	// See if it's an errno or a common wrapped error.
+	switch e := err.(type) {
+	case syscall.Errno:
+		return e
+	case *os.PathError:
+		return extractErrno(e.Err)
+	case *os.LinkError:
+		return extractErrno(e.Err)
+	case *os.SyscallError:
+		return extractErrno(e.Err)
+	}
+
+	// Sentinel errors defined by package os.
+	if err == os.ErrNotExist {
+		return syscall.ENOENT
+	}
+	if err == os.ErrExist {
+		return syscall.EEXIST
+	}
+	if err == os.ErrPermission {
+		return syscall.EACCES
+	}
+	if err == os.ErrInvalid {
+		return syscall.EINVAL
+	}
+
+	// Fall back to EIO.
+	log.Debugf("Unknown error: %v, defaulting to EIO", err)
+	return syscall.EIO
+}
diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go
new file mode 100644
index 000000000..7d834d596
--- /dev/null
+++ b/runsc/fsgofer/fsgofer_test.go
@@ -0,0 +1,576 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsgofer
+
+import (
+ "fmt"
+ "io/ioutil"
+ "os"
+ "syscall"
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/p9"
+)
+
+// init enables debug logging for the tests and fills allConfs with the
+// read-write configurations followed by the read-only ones.
+func init() {
+	log.SetLevel(log.Debug)
+
+	for _, group := range [][]Config{rwConfs, roConfs} {
+		allConfs = append(allConfs, group...)
+	}
+}
+
+var (
+	// allTypes lists the file types that the tests iterate over.
+	allTypes = []fileType{regular, directory, symlink}
+
+	// allConfs is the concatenation of rwConfs and roConfs; it is filled in
+	// by init() above.
+	allConfs []Config
+
+	// rwConfs are the writable configurations, with and without lazy open.
+	rwConfs = []Config{
+		{ROMount: false, LazyOpenForWrite: false},
+		{ROMount: false, LazyOpenForWrite: true},
+	}
+
+	// roConfs are the read-only configurations, with and without lazy open.
+	roConfs = []Config{
+		{ROMount: true, LazyOpenForWrite: false},
+		{ROMount: true, LazyOpenForWrite: true},
+	}
+)
+
+type state struct { // state is the fixture that runCustom passes to each test callback.
+ root *localFile // file returned by Attach("/") on the test's attach point
+ file *localFile // entry created by setup, reached by walking from root
+ conf Config // configuration the attach point was created with
+ ft fileType // type of the entry that file refers to
+}
+
+// String labels the state in test messages with its LazyOpenForWrite setting
+// and its file type. ROMount is not part of the label.
+func (s state) String() string {
+	lazy, ft := s.conf.LazyOpenForWrite, s.ft
+	return fmt.Sprintf("lazyopen(%v)-%v", lazy, ft)
+}
+
+func runAll(t *testing.T, test func(*testing.T, state)) {
+ runCustom(t, allTypes, allConfs, test) // every file type against every configuration
+}
+
+// runCustom calls test once for every combination of confs and types.
+//
+// For each combination a fresh temporary tree is created with setup, an attach
+// point using the configuration is attached at "/", and the entry created by
+// setup is walked to. The resulting files are handed to test in a state value.
+// Files are closed and the temporary tree is removed at the end of each
+// combination, including when test aborts with t.Fatalf.
+func runCustom(t *testing.T, types []fileType, confs []Config, test func(*testing.T, state)) {
+	for _, c := range confs {
+		t.Logf("Config: %+v", c)
+
+		for _, ft := range types {
+			t.Logf("File type: %v", ft)
+
+			// Run each combination in its own function so that the deferred
+			// cleanup happens per iteration rather than piling up until
+			// runCustom returns.
+			func() {
+				path, name, err := setup(ft)
+				if err != nil {
+					t.Fatalf("%v", err)
+				}
+				defer os.RemoveAll(path)
+
+				a := NewAttachPoint(path, c)
+				root, err := a.Attach("/")
+				if err != nil {
+					t.Fatalf("Attach(%q) failed, err: %v", "/", err)
+				}
+				defer root.Close()
+
+				_, file, err := root.Walk([]string{name})
+				if err != nil {
+					t.Fatalf("root.Walk({%q}) failed, err: %v", name, err)
+				}
+				defer file.Close()
+
+				st := state{root: root.(*localFile), file: file.(*localFile), conf: c, ft: ft}
+				test(t, st)
+			}()
+		}
+	}
+}
+
+// setup creates a temporary directory containing a single entry of type ft.
+//
+// It returns the path of the temporary directory and the name of the entry
+// inside it. The caller owns the directory and must remove it. When setup
+// fails, the directory is removed before returning. An unknown file type
+// panics.
+func setup(ft fileType) (string, string, error) {
+	path, err := ioutil.TempDir("", "root-")
+	if err != nil {
+		return "", "", fmt.Errorf("ioutil.TempDir() failed, err: %v", err)
+	}
+	// Do not leave the temporary directory behind unless it is handed to the
+	// caller.
+	done := false
+	defer func() {
+		if !done {
+			os.RemoveAll(path)
+		}
+	}()
+
+	// First attach with writable configuration to setup tree.
+	a := NewAttachPoint(path, Config{})
+	root, err := a.Attach("/")
+	if err != nil {
+		return "", "", fmt.Errorf("Attach(%q) failed, err: %v", "/", err)
+	}
+	defer root.Close()
+
+	var name string
+	switch ft {
+	case regular:
+		name = "file"
+		_, f, _, _, err := root.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
+		if err != nil {
+			return "", "", fmt.Errorf("root.Create(%q) failed, err: %v", name, err)
+		}
+		defer f.Close()
+	case directory:
+		name = "dir"
+		if _, err := root.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
+			return "", "", fmt.Errorf("root.MkDir(%q) failed, err: %v", name, err)
+		}
+	case symlink:
+		name = "symlink"
+		if _, err := root.Symlink("/some/target", name, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
+			return "", "", fmt.Errorf("root.Symlink(%q) failed, err: %v", name, err)
+		}
+	default:
+		panic(fmt.Sprintf("unknown file type %v", ft))
+	}
+
+	done = true
+	return path, name, nil
+}
+
+// createFile creates name inside dir with mode 0777, opened read-write and
+// owned by the current uid and gid, and returns the new file.
+func createFile(dir *localFile, name string) (*localFile, error) {
+	uid, gid := p9.UID(os.Getuid()), p9.GID(os.Getgid())
+	_, created, _, _, err := dir.Create(name, p9.ReadWrite, 0777, uid, gid)
+	if err != nil {
+		return nil, err
+	}
+	return created.(*localFile), nil
+}
+
+// TestReadWrite writes a file in a writable directory, then re-opens it with
+// each open mode and checks that ReadAt and WriteAt succeed or fail according
+// to that mode.
+func TestReadWrite(t *testing.T) {
+	runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+		child, err := createFile(s.file, "test")
+		if err != nil {
+			t.Fatalf("%v: createFile() failed, err: %v", s, err)
+		}
+		defer child.Close()
+
+		b := []byte("foobar")
+		w, err := child.WriteAt(b, 0)
+		if err != nil {
+			t.Fatalf("%v: WriteAt() failed, err: %v", s, err)
+		}
+		if w != len(b) {
+			t.Fatalf("%v: WriteAt() was partial, got: %d, expected: %d", s, w, len(b))
+		}
+
+		for _, test := range []struct {
+			flags p9.OpenFlags
+			read  bool
+			write bool
+		}{
+			{flags: p9.ReadOnly, read: true, write: false},
+			{flags: p9.WriteOnly, read: false, write: true},
+			{flags: p9.ReadWrite, read: true, write: true},
+		} {
+			// One function per case so that the walked file is closed at the
+			// end of the case, even when it aborts.
+			func() {
+				_, l, err := s.file.Walk([]string{"test"})
+				if err != nil {
+					t.Fatalf("%v: Walk(%s) failed, err: %v", s, "test", err)
+				}
+				defer l.Close()
+
+				if _, _, _, err := l.Open(test.flags); err != nil {
+					t.Fatalf("%v: Open(%v) failed, err: %v", s, test.flags, err)
+				}
+
+				w, err := l.WriteAt(b, 0)
+				if test.write {
+					if err != nil {
+						t.Fatalf("%v, %v: WriteAt() failed, err: %v", s, test.flags, err)
+					}
+					if w != len(b) {
+						t.Fatalf("%v, %v: WriteAt() was partial, got: %d, expected: %d", s, test.flags, w, len(b))
+					}
+				} else if err == nil {
+					t.Fatalf("%v, %v: WriteAt() should have failed", s, test.flags)
+				}
+
+				rBuf := make([]byte, len(b))
+				r, err := l.ReadAt(rBuf, 0)
+				if test.read {
+					if err != nil {
+						t.Fatalf("%v, %v: ReadAt() failed, err: %v", s, test.flags, err)
+					}
+					if r != len(rBuf) {
+						t.Fatalf("%v, %v: ReadAt() was partial, got: %d, expected: %d", s, test.flags, r, len(rBuf))
+					}
+					if string(rBuf) != "foobar" {
+						t.Fatalf("%v, %v: ReadAt() wrong data, got: %s, expected: %s", s, test.flags, string(rBuf), "foobar")
+					}
+				} else if err == nil {
+					t.Fatalf("%v, %v: ReadAt() should have failed", s, test.flags)
+				}
+			}()
+		}
+	})
+}
+
+// TestCreate creates files with write-only and read-write flags and checks
+// that writing always works while reading works only for read-write.
+func TestCreate(t *testing.T) {
+	runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+		for i, test := range []struct {
+			flags p9.OpenFlags
+			read  bool
+		}{
+			{flags: p9.WriteOnly, read: false},
+			{flags: p9.ReadWrite, read: true},
+		} {
+			// One function per case so that the created file is closed at the
+			// end of the case, even when it aborts.
+			func() {
+				name := fmt.Sprintf("test-%d", i)
+				_, l, _, _, err := s.file.Create(name, test.flags, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
+				if err != nil {
+					t.Fatalf("%v, %v: Create(%q) failed, err: %v", s, test.flags, name, err)
+				}
+				defer l.Close()
+
+				b := []byte("foobar")
+				w, err := l.WriteAt(b, 0)
+				if err != nil {
+					t.Fatalf("%v, %v: WriteAt() failed, err: %v", s, test.flags, err)
+				}
+				if w != len(b) {
+					t.Fatalf("%v, %v: WriteAt() was partial, got: %d, expected: %d", s, test.flags, w, len(b))
+				}
+
+				rBuf := make([]byte, len(b))
+				r, err := l.ReadAt(rBuf, 0)
+				if test.read {
+					if err != nil {
+						t.Fatalf("%v, %v: ReadAt() failed, err: %v", s, test.flags, err)
+					}
+					if r != len(rBuf) {
+						t.Fatalf("%v, %v: ReadAt() was partial, got: %d, expected: %d", s, test.flags, r, len(rBuf))
+					}
+					if string(rBuf) != "foobar" {
+						t.Fatalf("%v, %v: ReadAt() wrong data, got: %s, expected: %s", s, test.flags, string(rBuf), "foobar")
+					}
+				} else if err == nil {
+					t.Fatalf("%v, %v: ReadAt() should have failed", s, test.flags)
+				}
+			}()
+		}
+	})
+}
+
+// TestUnopened checks that I/O operations on a regular file that was walked to
+// but never opened fail with EBADF.
+func TestUnopened(t *testing.T) {
+	check := func(t *testing.T, s state) {
+		buf := []byte("foobar")
+
+		_, err := s.file.WriteAt(buf, 0)
+		if err != syscall.EBADF {
+			t.Errorf("%v: WriteAt() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+
+		_, err = s.file.ReadAt(buf, 0)
+		if err != syscall.EBADF {
+			t.Errorf("%v: ReadAt() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+
+		_, err = s.file.Readdir(0, 100)
+		if err != syscall.EBADF {
+			t.Errorf("%v: Readdir() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+
+		err = s.file.FSync()
+		if err != syscall.EBADF {
+			t.Errorf("%v: FSync() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+	}
+	runCustom(t, []fileType{regular}, allConfs, check)
+}
+
+func SetGetAttr(l *localFile, valid p9.SetAttrMask, attr p9.SetAttr) (p9.Attr, error) {
+ if err := l.SetAttr(valid, attr); err != nil {
+ return p9.Attr{}, err
+ }
+ _, _, a, err := l.GetAttr(p9.AttrMask{})
+ if err != nil {
+ return p9.Attr{}, err
+ }
+ return a, nil
+}
+
+// TestSetAttrPerm sets permissions 0777 on each file type. The change must be
+// rejected for symlinks and must be readable back for everything else.
+func TestSetAttrPerm(t *testing.T) {
+	check := func(t *testing.T, s state) {
+		mask := p9.SetAttrMask{Permissions: true}
+		attr := p9.SetAttr{Permissions: 0777}
+		got, err := SetGetAttr(s.file, mask, attr)
+
+		if s.ft == symlink {
+			if err == nil {
+				t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions)
+			}
+			return
+		}
+
+		if err != nil {
+			t.Fatalf("%v: SetGetAttr(valid, %v) failed, err: %v", s, attr.Permissions, err)
+		}
+		if perm := got.Mode.Permissions(); perm != attr.Permissions {
+			t.Errorf("%v: wrong permission, got: %v, expected: %v", s, perm, attr.Permissions)
+		}
+	}
+	runCustom(t, allTypes, rwConfs, check)
+}
+
+// TestSetAttrSize resizes each file type to several sizes. Resizing must fail
+// for symlinks and directories and must be readable back for regular files.
+func TestSetAttrSize(t *testing.T) {
+	runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) {
+		for _, size := range []uint64{1024, 0, 1024 * 1024} {
+			valid := p9.SetAttrMask{Size: true}
+			attr := p9.SetAttr{Size: size}
+			got, err := SetGetAttr(s.file, valid, attr)
+			if s.ft == symlink || s.ft == directory {
+				if err == nil {
+					// Report the size that was attempted, which is the
+					// attribute under test.
+					t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Size)
+				}
+				// Run for one size only, they will all fail the same way.
+				return
+			}
+			if err != nil {
+				t.Fatalf("%v: SetGetAttr(valid, %v) failed, err: %v", s, attr.Size, err)
+			}
+			if got.Size != size {
+				t.Errorf("%v: wrong size, got: %v, expected: %v", s, got.Size, size)
+			}
+		}
+	})
+}
+
+func TestSetAttrTime(t *testing.T) {
+ runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) {
+ valid := p9.SetAttrMask{ATime: true, ATimeNotSystemTime: true}
+ attr := p9.SetAttr{ATimeSeconds: 123, ATimeNanoSeconds: 456}
+ got, err := SetGetAttr(s.file, valid, attr)
+ if err != nil {
+ t.Fatalf("%v: SetGetAttr(valid, %v:%v) failed, err: %v", s, attr.ATimeSeconds, attr.ATimeNanoSeconds, err)
+ }
+ if got.ATimeSeconds != 123 {
+ t.Errorf("%v: wrong ATimeSeconds, got: %v, expected: %v", s, got.ATimeSeconds, 123)
+ }
+ if got.ATimeNanoSeconds != 456 {
+ t.Errorf("%v: wrong ATimeNanoSeconds, got: %v, expected: %v", s, got.ATimeNanoSeconds, 456)
+ }
+
+ valid = p9.SetAttrMask{MTime: true, MTimeNotSystemTime: true}
+ attr = p9.SetAttr{MTimeSeconds: 789, MTimeNanoSeconds: 012}
+ got, err = SetGetAttr(s.file, valid, attr)
+ if err != nil {
+ t.Fatalf("%v: SetGetAttr(valid, %v:%v) failed, err: %v", s, attr.MTimeSeconds, attr.MTimeNanoSeconds, err)
+ }
+ if got.MTimeSeconds != 789 {
+ t.Errorf("%v: wrong MTimeSeconds, got: %v, expected: %v", s, got.MTimeSeconds, 789)
+ }
+ if got.MTimeNanoSeconds != 012 {
+ t.Errorf("%v: wrong MTimeNanoSeconds, got: %v, expected: %v", s, got.MTimeNanoSeconds, 012)
+ }
+ })
+}
+
+// TestSetAttrOwner changes the owner of each file type to the current uid plus
+// one and checks that the new uid is read back. It is skipped unless running
+// as uid 0.
+func TestSetAttrOwner(t *testing.T) {
+	if uid := os.Getuid(); uid != 0 {
+		t.Skipf("SetAttr(owner) test requires CAP_CHOWN, running as %d", uid)
+	}
+
+	check := func(t *testing.T, s state) {
+		newUID := os.Getuid() + 1
+		attr := p9.SetAttr{UID: p9.UID(newUID)}
+
+		got, err := SetGetAttr(s.file, p9.SetAttrMask{UID: true}, attr)
+		if err != nil {
+			t.Fatalf("%v: SetGetAttr(valid, %v) failed, err: %v", s, attr.UID, err)
+		}
+		if got.UID != p9.UID(newUID) {
+			t.Errorf("%v: wrong uid, got: %v, expected: %v", s, got.UID, newUID)
+		}
+	}
+	runCustom(t, allTypes, rwConfs, check)
+}
+
+// TestLink links each file type into a freshly created directory. Linking a
+// directory must fail with EPERM; the other types must succeed. It is skipped
+// unless running as uid 0.
+func TestLink(t *testing.T) {
+	if os.Getuid() != 0 {
+		t.Skipf("Link test requires CAP_DAC_READ_SEARCH, running as %d", os.Getuid())
+	}
+	runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) {
+		const dirName = "linkdir"
+		const linkFile = "link"
+		if _, err := s.root.Mkdir(dirName, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
+			t.Fatalf("%v: MkDir(%s) failed, err: %v", s, dirName, err)
+		}
+		_, dir, err := s.root.Walk([]string{dirName})
+		if err != nil {
+			t.Fatalf("%v: Walk({%s}) failed, err: %v", s, dirName, err)
+		}
+		// The walked directory is owned by this test, so release it here.
+		defer dir.Close()
+
+		err = dir.Link(s.file, linkFile)
+		if s.ft == directory {
+			if err != syscall.EPERM {
+				t.Errorf("%v: Link(target, %s) should have failed, got: %v, expected: syscall.EPERM", s, linkFile, err)
+			}
+			return
+		}
+		if err != nil {
+			t.Errorf("%v: Link(target, %s) failed, err: %v", s, linkFile, err)
+		}
+	})
+}
+
+// TestROMountChecks checks that every mutating operation fails with EBADF when
+// the attach point is configured as a read-only mount.
+func TestROMountChecks(t *testing.T) {
+	check := func(t *testing.T, s state) {
+		uid, gid := p9.UID(os.Getuid()), p9.GID(os.Getgid())
+
+		_, _, _, _, err := s.file.Create("..", p9.ReadWrite, 0777, uid, gid)
+		if err != syscall.EBADF {
+			t.Errorf("%v: Create() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+
+		_, err = s.file.Mkdir("..", 0777, uid, gid)
+		if err != syscall.EBADF {
+			t.Errorf("%v: MkDir() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+
+		err = s.file.Rename(s.file, "..")
+		if err != syscall.EBADF {
+			t.Errorf("%v: Rename() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+
+		_, err = s.file.Symlink("some_place", "..", uid, gid)
+		if err != syscall.EBADF {
+			t.Errorf("%v: Symlink() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+
+		err = s.file.UnlinkAt("..", 0)
+		if err != syscall.EBADF {
+			t.Errorf("%v: UnlinkAt() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+
+		err = s.file.Link(s.file, "..")
+		if err != syscall.EBADF {
+			t.Errorf("%v: Link() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+
+		// Truncation is a write too.
+		err = s.file.SetAttr(p9.SetAttrMask{Size: true}, p9.SetAttr{Size: 0})
+		if err != syscall.EBADF {
+			t.Errorf("%v: SetAttr() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+	}
+	runCustom(t, allTypes, roConfs, check)
+}
+
+// TestInvalidName checks that every operation taking a name rejects ".." with
+// EINVAL on a writable mount.
+func TestInvalidName(t *testing.T) {
+	check := func(t *testing.T, s state) {
+		uid, gid := p9.UID(os.Getuid()), p9.GID(os.Getgid())
+
+		_, _, _, _, err := s.file.Create("..", p9.ReadWrite, 0777, uid, gid)
+		if err != syscall.EINVAL {
+			t.Errorf("%v: Create() should have failed, got: %v, expected: syscall.EINVAL", s, err)
+		}
+
+		_, _, err = s.file.Walk([]string{".."})
+		if err != syscall.EINVAL {
+			t.Errorf("%v: Walk() should have failed, got: %v, expected: syscall.EINVAL", s, err)
+		}
+
+		_, err = s.file.Mkdir("..", 0777, uid, gid)
+		if err != syscall.EINVAL {
+			t.Errorf("%v: MkDir() should have failed, got: %v, expected: syscall.EINVAL", s, err)
+		}
+
+		err = s.file.Rename(s.file, "..")
+		if err != syscall.EINVAL {
+			t.Errorf("%v: Rename() should have failed, got: %v, expected: syscall.EINVAL", s, err)
+		}
+
+		_, err = s.file.Symlink("some_place", "..", uid, gid)
+		if err != syscall.EINVAL {
+			t.Errorf("%v: Symlink() should have failed, got: %v, expected: syscall.EINVAL", s, err)
+		}
+
+		err = s.file.UnlinkAt("..", 0)
+		if err != syscall.EINVAL {
+			t.Errorf("%v: UnlinkAt() should have failed, got: %v, expected: syscall.EINVAL", s, err)
+		}
+
+		err = s.file.Link(s.file, "..")
+		if err != syscall.EINVAL {
+			t.Errorf("%v: Link() should have failed, got: %v, expected: syscall.EINVAL", s, err)
+		}
+	}
+	runCustom(t, []fileType{regular}, rwConfs, check)
+}
+
+// TestIsNameValid checks isNameValid against names that must be accepted and
+// names that must be rejected.
+func TestIsNameValid(t *testing.T) {
+	// Accepted: plain names, including ones that merely start with dots.
+	for _, name := range []string{
+		"name",
+		"123",
+		"!@#$%^&*()",
+		".name",
+		"..name",
+		"...",
+	} {
+		got := isNameValid(name)
+		if !got {
+			t.Errorf("isNameValid(%s) failed, got: %v, expected: true", name, got)
+		}
+	}
+
+	// Rejected: "." and "..", and anything containing a slash.
+	for _, name := range []string{
+		".",
+		"..",
+		"name/name",
+		"/name",
+		"name/",
+	} {
+		got := isNameValid(name)
+		if got {
+			t.Errorf("isNameValid(%s) failed, got: %v, expected: false", name, got)
+		}
+	}
+}
+
+// TestWalkNotFound checks that walking to a missing entry of a directory
+// fails with ENOENT under every configuration.
+func TestWalkNotFound(t *testing.T) {
+	check := func(t *testing.T, s state) {
+		_, _, err := s.file.Walk([]string{"nobody-here"})
+		if err == syscall.ENOENT {
+			return
+		}
+		t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: syscall.ENOENT", s, "nobody-here", err)
+	}
+	runCustom(t, []fileType{directory}, allConfs, check)
+}
+
+// TestWalkDup checks that walking with an empty name list yields a second,
+// usable file for the same entry.
+func TestWalkDup(t *testing.T) {
+	runAll(t, func(t *testing.T, s state) {
+		_, dup, err := s.file.Walk([]string{})
+		if err != nil {
+			t.Fatalf("%v: Walk(nil) failed, err: %v", s, err)
+		}
+		// The duplicate is owned by this test, so release it here.
+		defer dup.Close()
+
+		// Check that 'dup' is usable.
+		if _, _, _, err := dup.GetAttr(p9.AttrMask{}); err != nil {
+			t.Errorf("%v: GetAttr() failed, err: %v", s, err)
+		}
+	})
+}
+
+// TestReaddir fills a directory with one sub-directory, one symlink and one
+// regular file, then checks that Readdir returns exactly those three entries
+// and that each entry's type agrees with the type reported by GetAttr.
+func TestReaddir(t *testing.T) {
+	runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+		name := "dir"
+		if _, err := s.file.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
+			t.Fatalf("%v: MkDir(%s) failed, err: %v", s, name, err)
+		}
+		name = "symlink"
+		if _, err := s.file.Symlink("/some/target", name, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
+			t.Fatalf("%v: Symlink(%q) failed, err: %v", s, name, err)
+		}
+		name = "file"
+		_, f, _, _, err := s.file.Create(name, p9.ReadWrite, 0555, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
+		if err != nil {
+			t.Fatalf("%v: Create(%q) failed, err: %v", s, name, err)
+		}
+		f.Close()
+
+		if _, _, _, err := s.file.Open(p9.ReadOnly); err != nil {
+			t.Fatalf("%v: Open(ReadOnly) failed, err: %v", s, err)
+		}
+
+		dirents, err := s.file.Readdir(0, 10)
+		if err != nil {
+			t.Fatalf("%v: Readdir(0, 10) failed, err: %v", s, err)
+		}
+		if len(dirents) != 3 {
+			t.Fatalf("%v: Readdir(0, 10) wrong number of items, got: %v, expected: 3", s, len(dirents))
+		}
+
+		// Named seen* so that the file type constants are not shadowed.
+		var seenDir, seenSymlink, seenFile bool
+		for _, d := range dirents {
+			switch d.Name {
+			case "dir":
+				if d.Type != p9.TypeDir {
+					t.Errorf("%v: dirent.Type got: %v, expected: %v", s, d.Type, p9.TypeDir)
+				}
+				seenDir = true
+			case "symlink":
+				if d.Type != p9.TypeSymlink {
+					t.Errorf("%v: dirent.Type got: %v, expected: %v", s, d.Type, p9.TypeSymlink)
+				}
+				seenSymlink = true
+			case "file":
+				if d.Type != p9.TypeRegular {
+					t.Errorf("%v: dirent.Type got: %v, expected: %v", s, d.Type, p9.TypeRegular)
+				}
+				seenFile = true
+			default:
+				t.Errorf("%v: dirent.Name got: %v", s, d.Name)
+			}
+
+			// One function per entry so that the walked file is closed at the
+			// end of the entry, even when the check aborts.
+			func() {
+				_, entry, err := s.file.Walk([]string{d.Name})
+				if err != nil {
+					t.Fatalf("%v: Walk({%s}) failed, err: %v", s, d.Name, err)
+				}
+				defer entry.Close()
+
+				_, _, a, err := entry.GetAttr(p9.AttrMask{})
+				if err != nil {
+					t.Fatalf("%v: GetAttr() failed, err: %v", s, err)
+				}
+				if d.Type != a.Mode.QIDType() {
+					t.Errorf("%v: dirent.Type different than GetAttr().Mode.QIDType(), got: %v, expected: %v", s, d.Type, a.Mode.QIDType())
+				}
+			}()
+		}
+		if !seenDir || !seenSymlink || !seenFile {
+			t.Errorf("%v: Readdir(0, 10) wrong files returned, dir: %v, symlink: %v, file: %v", s, seenDir, seenSymlink, seenFile)
+		}
+	})
+}
diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go
new file mode 100644
index 000000000..e676809ac
--- /dev/null
+++ b/runsc/fsgofer/fsgofer_unsafe.go
@@ -0,0 +1,58 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsgofer
+
+import (
+ "syscall"
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+)
+
+// statAt stats name relative to the directory file descriptor dirFd by
+// invoking newfstatat with AT_SYMLINK_NOFOLLOW.
+//
+// It returns the stat result, or a zero Stat_t and the errno on failure. A
+// name that cannot be converted to a C string is reported through
+// extractErrno.
+func statAt(dirFd int, name string) (syscall.Stat_t, error) {
+	nameBytes, err := syscall.BytePtrFromString(name)
+	if err != nil {
+		return syscall.Stat_t{}, extractErrno(err)
+	}
+	namePtr := unsafe.Pointer(nameBytes)
+
+	var stat syscall.Stat_t
+	statPtr := unsafe.Pointer(&stat)
+
+	// The conversions to uintptr must appear in the call expression itself.
+	// A uintptr held in a variable does not keep the object alive and is not
+	// updated if the object moves before the syscall runs.
+	if _, _, errno := syscall.Syscall6(
+		syscall.SYS_NEWFSTATAT,
+		uintptr(dirFd),
+		uintptr(namePtr),
+		uintptr(statPtr),
+		linux.AT_SYMLINK_NOFOLLOW,
+		0,
+		0); errno != 0 {
+		return syscall.Stat_t{}, errno
+	}
+	return stat, nil
+}
+
+// utimensat invokes the utimensat syscall to set the two timestamps in times
+// on name relative to dirFd, passing flags through unchanged. An empty name
+// makes the call operate on dirFd itself.
+//
+// It returns nil on success and the errno on failure. A name that cannot be
+// converted to a C string is reported through extractErrno.
+func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) error {
+	// utimensat(2) doesn't accept empty name, instead name must be nil to make it
+	// operate directly on 'dirFd' unlike other *at syscalls.
+	var namePtr unsafe.Pointer
+	if name != "" {
+		nameBytes, err := syscall.BytePtrFromString(name)
+		if err != nil {
+			return extractErrno(err)
+		}
+		namePtr = unsafe.Pointer(nameBytes)
+	}
+
+	timesPtr := unsafe.Pointer(&times[0])
+
+	// The conversions to uintptr must appear in the call expression itself.
+	// A uintptr held in a variable does not keep the object alive and is not
+	// updated if the object moves before the syscall runs.
+	if _, _, errno := syscall.Syscall6(
+		syscall.SYS_UTIMENSAT,
+		uintptr(dirFd),
+		uintptr(namePtr),
+		uintptr(timesPtr),
+		uintptr(flags),
+		0,
+		0); errno != 0 {
+		return errno
+	}
+	return nil
+}
diff --git a/runsc/main.go b/runsc/main.go
new file mode 100644
index 000000000..cf4b99d3f
--- /dev/null
+++ b/runsc/main.go
@@ -0,0 +1,199 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Binary runsc is an implementation of the Open Container Initiative Runtime
+// that runs applications inside a sandbox.
+package main
+
+import (
+ "fmt"
+ "io"
+ "os"
+ "path/filepath"
+ "strings"
+ "syscall"
+ "time"
+
+ "context"
+ "flag"
+
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/cmd"
+)
+
+// Command line flags. They are read in main to build the boot.Config and to
+// configure logging.
+var (
+	// Although these flags are not part of the OCI spec, they are used by
+	// Docker, and thus should not be changed.
+	rootDir     = flag.String("root", "", "root directory for storage of container state")
+	logFilename = flag.String("log", "", "file path where internal debug information is written, default is stderr")
+	logFormat   = flag.String("log-format", "text", "log format: text (default) or json")
+	debug       = flag.Bool("debug", false, "enable debug logging")
+
+	// These flags are unique to runsc, and are used to configure parts of the
+	// system that are not covered by the runtime spec.
+
+	// Debugging flags.
+	debugLogDir = flag.String("debug-log-dir", "", "additional location for logs. It creates individual log files per command")
+	logPackets  = flag.Bool("log-packets", false, "enable network packet logging")
+
+	// Debugging flags: strace related
+	strace         = flag.Bool("strace", false, "enable strace")
+	straceSyscalls = flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.")
+	straceLogSize  = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs")
+
+	// Flags that control sandbox runtime behavior.
+	platform   = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm")
+	network    = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
+	fileAccess = flag.String("file-access", "proxy", "specifies which filesystem to use: proxy (default), direct. Using a proxy is more secure because it disallows the sandbox from opening files directly in the host.")
+	overlay    = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
+)
+
+// main registers the runsc subcommands, builds the boot.Config from the
+// command line flags, sets up logging and runs the selected subcommand.
+//
+// When the subcommand succeeds, the process exits according to the wait status
+// it filled in: 128 plus the signal number if signaled, the exit status
+// otherwise. When the subcommand fails, the process exits with 128.
+func main() {
+	// Help and flags commands are generated automatically.
+	subcommands.Register(subcommands.HelpCommand(), "")
+	subcommands.Register(subcommands.FlagsCommand(), "")
+
+	// Register user-facing runsc commands.
+	subcommands.Register(new(cmd.Create), "")
+	subcommands.Register(new(cmd.Delete), "")
+	subcommands.Register(new(cmd.Events), "")
+	subcommands.Register(new(cmd.Exec), "")
+	subcommands.Register(new(cmd.Kill), "")
+	subcommands.Register(new(cmd.List), "")
+	subcommands.Register(new(cmd.PS), "")
+	subcommands.Register(new(cmd.Run), "")
+	subcommands.Register(new(cmd.Start), "")
+	subcommands.Register(new(cmd.State), "")
+
+	// Register internal commands with the internal group name. This causes
+	// them to be sorted below the user-facing commands with empty group.
+	// The string below will be printed above the commands.
+	// Gofer is registered only here, so it is not listed twice.
+	const internalGroup = "internal use only"
+	subcommands.Register(new(cmd.Boot), internalGroup)
+	subcommands.Register(new(cmd.Gofer), internalGroup)
+
+	// All subcommands must be registered before flag parsing.
+	flag.Parse()
+
+	platformType, err := boot.MakePlatformType(*platform)
+	if err != nil {
+		cmd.Fatalf("%v", err)
+	}
+
+	fsAccess, err := boot.MakeFileAccessType(*fileAccess)
+	if err != nil {
+		cmd.Fatalf("%v", err)
+	}
+
+	netType, err := boot.MakeNetworkType(*network)
+	if err != nil {
+		cmd.Fatalf("%v", err)
+	}
+
+	// Create a new Config from the flags.
+	conf := &boot.Config{
+		RootDir:       *rootDir,
+		FileAccess:    fsAccess,
+		Overlay:       *overlay,
+		Network:       netType,
+		LogPackets:    *logPackets,
+		Platform:      platformType,
+		Strace:        *strace,
+		StraceLogSize: *straceLogSize,
+	}
+	if len(*straceSyscalls) != 0 {
+		conf.StraceSyscalls = strings.Split(*straceSyscalls, ",")
+	}
+
+	// Set up logging.
+	if *debug {
+		log.SetLevel(log.Debug)
+	}
+
+	// Logs go to stderr unless --log names a file.
+	var logFile io.Writer = os.Stderr
+	if *logFilename != "" {
+		f, err := os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
+		if err != nil {
+			cmd.Fatalf("error opening log file %q: %v", *logFilename, err)
+		}
+		logFile = f
+	}
+
+	var e log.Emitter
+	switch *logFormat {
+	case "text":
+		e = log.GoogleEmitter{&log.Writer{Next: logFile}}
+	case "json":
+		e = log.JSONEmitter{log.Writer{Next: logFile}}
+	default:
+		cmd.Fatalf("invalid log format %q, must be 'json' or 'text'", *logFormat)
+	}
+
+	if *debugLogDir != "" {
+		if err := os.MkdirAll(*debugLogDir, 0775); err != nil {
+			cmd.Fatalf("error creating dir %q: %v", *debugLogDir, err)
+		}
+
+		// Format: <debug-log-dir>/runsc.log.<yyyymmdd-hhmmss.uuuuuu>.<command>
+		scmd := flag.CommandLine.Arg(0)
+		filename := fmt.Sprintf("runsc.log.%s.%s", time.Now().Format("20060102-150405.000000"), scmd)
+		path := filepath.Join(*debugLogDir, filename)
+		f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664)
+		if err != nil {
+			// Report the full path that was opened, not just the file name.
+			cmd.Fatalf("error opening log file %q: %v", path, err)
+		}
+		e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}}
+	}
+
+	log.SetTarget(e)
+
+	log.Infof("***************************")
+	log.Infof("Args: %s", os.Args)
+	log.Infof("PID: %d", os.Getpid())
+	log.Infof("UID: %d, GID: %d", os.Getuid(), os.Getgid())
+	log.Infof("Configuration:")
+	log.Infof("\t\tRootDir: %s", conf.RootDir)
+	log.Infof("\t\tPlatform: %v", conf.Platform)
+	log.Infof("\t\tFileAccess: %v, overlay: %t", conf.FileAccess, conf.Overlay)
+	log.Infof("\t\tNetwork: %v, logging: %t", conf.Network, conf.LogPackets)
+	log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls)
+	log.Infof("***************************")
+
+	// Call the subcommand and pass in the configuration.
+	var ws syscall.WaitStatus
+	subcmdCode := subcommands.Execute(context.Background(), conf, &ws)
+	if subcmdCode == subcommands.ExitSuccess {
+		log.Infof("Exiting with status: %v", ws)
+		if ws.Signaled() {
+			// No good way to return it, emulate what the shell does. Maybe raise
+			// signal to self?
+			os.Exit(128 + int(ws.Signal()))
+		}
+		os.Exit(ws.ExitStatus())
+	}
+	// Return an error that is unlikely to be used by the application.
+	log.Warningf("Failure to execute command, err: %v", subcmdCode)
+	os.Exit(128)
+}
+
+// init picks the default for the root flag: "runsc" under XDG_RUNTIME_DIR when
+// that variable is set, /var/run/runsc otherwise.
+func init() {
+	// Set default root dir to something (hopefully) user-writeable.
+	dir := "/var/run/runsc"
+	if xdg := os.Getenv("XDG_RUNTIME_DIR"); xdg != "" {
+		dir = filepath.Join(xdg, "runsc")
+	}
+	*rootDir = dir
+}
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
new file mode 100644
index 000000000..bdd95903e
--- /dev/null
+++ b/runsc/sandbox/BUILD
@@ -0,0 +1,53 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+go_library(
+ name = "sandbox",
+ srcs = [
+ "console.go",
+ "hook.go",
+ "namespace.go",
+ "network.go",
+ "sandbox.go",
+ "status.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/runsc/sandbox",
+ visibility = [ # only targets under //runsc may depend on this library
+ "//runsc:__subpackages__",
+ ],
+ deps = [
+ "//pkg/control/client",
+ "//pkg/control/server",
+ "//pkg/log",
+ "//pkg/sentry/control",
+ "//pkg/urpc",
+ "//runsc/boot",
+ "//runsc/specutils",
+ "@com_github_kr_pty//:go_default_library",
+ "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_vishvananda_netlink//:go_default_library",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
+
+go_test(
+ name = "sandbox_test",
+ size = "small",
+ srcs = ["sandbox_test.go"],
+ pure = "on",
+ rundir = ".",
+ deps = [ # the library under test is a regular dependency here
+ "//pkg/abi/linux",
+ "//pkg/log",
+ "//pkg/sentry/control",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/unet",
+ "//runsc/boot",
+ "//runsc/cmd",
+ "//runsc/sandbox",
+ "@com_github_google_subcommands//:go_default_library",
+ "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
diff --git a/runsc/sandbox/console.go b/runsc/sandbox/console.go
new file mode 100644
index 000000000..3f133e12a
--- /dev/null
+++ b/runsc/sandbox/console.go
@@ -0,0 +1,60 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sandbox
+
+import (
+ "fmt"
+ "net"
+ "os"
+
+ "github.com/kr/pty"
+ "golang.org/x/sys/unix"
+)
+
+// setupConsole creates pty master/slave pair, sends the master FD over the
+// given socket, and returns the slave.
+//
+// The master, the connection to socketPath and the file duplicated from that
+// connection are all closed before returning. On failure the slave is closed
+// as well and an error is returned. On success the caller owns the slave.
+func setupConsole(socketPath string) (*os.File, error) {
+	// Create a new pty master and slave.
+	ptyMaster, ptySlave, err := pty.Open()
+	if err != nil {
+		return nil, fmt.Errorf("error opening pty: %v", err)
+	}
+	defer ptyMaster.Close()
+
+	// Get a connection to the socket path.
+	conn, err := net.Dial("unix", socketPath)
+	if err != nil {
+		ptySlave.Close()
+		return nil, fmt.Errorf("error dial socket %q: %v", socketPath, err)
+	}
+	// The connection is only needed to hand over the master FD.
+	defer conn.Close()
+
+	uc, ok := conn.(*net.UnixConn)
+	if !ok {
+		ptySlave.Close()
+		return nil, fmt.Errorf("connection is not a UnixConn: %T", conn)
+	}
+	socket, err := uc.File()
+	if err != nil {
+		ptySlave.Close()
+		return nil, fmt.Errorf("error getting file for unix socket %v: %v", uc, err)
+	}
+	// File returns a duplicate of the connection's descriptor, which must be
+	// closed separately from conn.
+	defer socket.Close()
+
+	// Send the master FD over the connection.
+	msg := unix.UnixRights(int(ptyMaster.Fd()))
+	if err := unix.Sendmsg(int(socket.Fd()), []byte("pty-master"), msg, nil, 0); err != nil {
+		ptySlave.Close()
+		return nil, fmt.Errorf("error sending console over unix socket %q: %v", socketPath, err)
+	}
+	return ptySlave, nil
+}
diff --git a/runsc/sandbox/hook.go b/runsc/sandbox/hook.go
new file mode 100644
index 000000000..40b064cdc
--- /dev/null
+++ b/runsc/sandbox/hook.go
@@ -0,0 +1,111 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sandbox
+
+import (
+ "bytes"
+ "encoding/json"
+ "fmt"
+ "os/exec"
+ "path/filepath"
+ "strings"
+ "time"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+)
+
+// This file implements hooks as defined in OCI spec:
+// https://github.com/opencontainers/runtime-spec/blob/master/config.md#toc22
+//
+// "hooks":{
+// "prestart":[{
+// "path":"/usr/bin/dockerd",
+// "args":[
+// "libnetwork-setkey", "arg2",
+// ]
+// }]
+// },
+
+// executeHooksBestEffort executes hooks and logs warning in case they fail.
+// Runs all hooks, always.
+func executeHooksBestEffort(hooks []specs.Hook, s specs.State) {
+ for _, h := range hooks {
+ if err := executeHook(h, s); err != nil {
+ log.Warningf("Failure to execute hook %+v, err: %v", h, err)
+ }
+ }
+}
+
+// executeHooks executes hooks until the first one fails or they all execute.
+func executeHooks(hooks []specs.Hook, s specs.State) error {
+ for _, h := range hooks {
+ if err := executeHook(h, s); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func executeHook(h specs.Hook, s specs.State) error {
+ log.Debugf("Executing hook %+v, state: %+v", h, s)
+
+ if strings.TrimSpace(h.Path) == "" {
+ return fmt.Errorf("empty path for hook")
+ }
+ if !filepath.IsAbs(h.Path) {
+ return fmt.Errorf("path for hook is not absolute: %q", h.Path)
+ }
+
+ b, err := json.Marshal(s)
+ if err != nil {
+ return err
+ }
+ var stdout, stderr bytes.Buffer
+ cmd := exec.Cmd{
+ Path: h.Path,
+ Args: h.Args,
+ Env: h.Env,
+ Stdin: bytes.NewReader(b),
+ Stdout: &stdout,
+ Stderr: &stderr,
+ }
+ if err := cmd.Start(); err != nil {
+ return err
+ }
+
+ c := make(chan error, 1)
+ go func() {
+ c <- cmd.Wait()
+ }()
+
+ var timer <-chan time.Time
+ if h.Timeout != nil {
+ timer = time.After(time.Duration(*h.Timeout) * time.Second)
+ }
+ select {
+ case err := <-c:
+ if err != nil {
+ return fmt.Errorf("failure executing hook %q, err: %v\nstdout: %s\nstderr: %s", h.Path, err, stdout.String(), stderr.String())
+ }
+ case <-timer:
+ cmd.Process.Kill()
+ cmd.Wait()
+ return fmt.Errorf("timeout executing hook %q\nstdout: %s\nstderr: %s", h.Path, stdout.String(), stderr.String())
+ }
+
+ log.Debugf("Execute hook %q success!", h.Path)
+ return nil
+}
diff --git a/runsc/sandbox/namespace.go b/runsc/sandbox/namespace.go
new file mode 100644
index 000000000..1d3bcfbb5
--- /dev/null
+++ b/runsc/sandbox/namespace.go
@@ -0,0 +1,204 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sandbox
+
+import (
+ "fmt"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "runtime"
+ "syscall"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "golang.org/x/sys/unix"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+)
+
+// nsCloneFlag returns the clone flag that can be used to set a namespace of
+// the given type.
+func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr {
+ switch nst {
+ case specs.IPCNamespace:
+ return syscall.CLONE_NEWIPC
+ case specs.MountNamespace:
+ return syscall.CLONE_NEWNS
+ case specs.NetworkNamespace:
+ return syscall.CLONE_NEWNET
+ case specs.PIDNamespace:
+ return syscall.CLONE_NEWPID
+ case specs.UTSNamespace:
+ return syscall.CLONE_NEWUTS
+ case specs.UserNamespace:
+ return syscall.CLONE_NEWUSER
+ case specs.CgroupNamespace:
+ panic("cgroup namespace has no associated clone flag")
+ default:
+ panic(fmt.Sprintf("unknown namespace %v", nst))
+ }
+}
+
+// nsPath returns the path of the namespace for the current process and the
+// given namespace.
+func nsPath(nst specs.LinuxNamespaceType) string {
+ base := "/proc/self/ns"
+ switch nst {
+ case specs.CgroupNamespace:
+ return filepath.Join(base, "cgroup")
+ case specs.IPCNamespace:
+ return filepath.Join(base, "ipc")
+ case specs.MountNamespace:
+ return filepath.Join(base, "mnt")
+ case specs.NetworkNamespace:
+ return filepath.Join(base, "net")
+ case specs.PIDNamespace:
+ return filepath.Join(base, "pid")
+ case specs.UserNamespace:
+ return filepath.Join(base, "user")
+ case specs.UTSNamespace:
+ return filepath.Join(base, "uts")
+ default:
+ panic(fmt.Sprintf("unknown namespace %v", nst))
+ }
+}
+
+// getNS returns true and the namespace with the given type from the slice of
+// namespaces in the spec. It returns false if the slice does not contain a
+// namespace with the type.
+func getNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) {
+ if s.Linux == nil {
+ return specs.LinuxNamespace{}, false
+ }
+ for _, ns := range s.Linux.Namespaces {
+ if ns.Type == nst {
+ return ns, true
+ }
+ }
+ return specs.LinuxNamespace{}, false
+}
+
+// filterNS returns a slice of namespaces from the spec with types that match
+// those in the `filter` slice.
+func filterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNamespace {
+ if s.Linux == nil {
+ return nil
+ }
+ var out []specs.LinuxNamespace
+ for _, nst := range filter {
+ if ns, ok := getNS(nst, s); ok {
+ out = append(out, ns)
+ }
+ }
+ return out
+}
+
+// setNS sets the namespace of the given type. It must be called with
+// OSThreadLocked.
+func setNS(fd, nsType uintptr) error {
+ if _, _, err := syscall.RawSyscall(unix.SYS_SETNS, fd, nsType, 0); err != 0 {
+ return err
+ }
+ return nil
+}
+
+// applyNS applies the namespace on the current thread and returns a function
+// that will restore the namespace to the original value.
+//
+// Preconditions: Must be called with os thread locked.
+func applyNS(ns specs.LinuxNamespace) (func(), error) {
+ log.Infof("applying namespace %v at path %q", ns.Type, ns.Path)
+ newNS, err := os.Open(ns.Path)
+ if err != nil {
+ return nil, fmt.Errorf("error opening %q: %v", ns.Path, err)
+ }
+ defer newNS.Close()
+
+ // Store current netns to restore back after child is started.
+ curPath := nsPath(ns.Type)
+ oldNS, err := os.Open(curPath)
+ if err != nil {
+ return nil, fmt.Errorf("error opening %q: %v", curPath, err)
+ }
+
+ // Set netns to the one requested and setup function to restore it back.
+ flag := nsCloneFlag(ns.Type)
+ if err := setNS(newNS.Fd(), flag); err != nil {
+ oldNS.Close()
+ return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err)
+ }
+ return func() {
+ log.Infof("restoring namespace %v", ns.Type)
+ defer oldNS.Close()
+ if err := setNS(oldNS.Fd(), flag); err != nil {
+ panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err))
+ }
+ }, nil
+}
+
+// startInNS joins or creates the given namespaces and calls cmd.Start before
+// restoring the namespaces to the original values.
+func startInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error {
+ // We are about to setup namespaces, which requires the os thread being
+ // locked so that Go doesn't change the thread out from under us.
+ runtime.LockOSThread()
+ defer runtime.UnlockOSThread()
+
+ if cmd.SysProcAttr == nil {
+ cmd.SysProcAttr = &syscall.SysProcAttr{}
+ }
+
+ for _, ns := range nss {
+ if ns.Path == "" {
+ // No path. Just set a flag to create a new namespace.
+ cmd.SysProcAttr.Cloneflags |= nsCloneFlag(ns.Type)
+ continue
+ }
+ // Join the given namespace, and restore the current namespace
+ // before exiting.
+ restoreNS, err := applyNS(ns)
+ if err != nil {
+ return err
+ }
+ defer restoreNS()
+ }
+
+ return cmd.Start()
+}
+
+// setUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd.
+func setUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) {
+ if s.Linux == nil {
+ return
+ }
+ if cmd.SysProcAttr == nil {
+ cmd.SysProcAttr = &syscall.SysProcAttr{}
+ }
+ for _, idMap := range s.Linux.UIDMappings {
+ log.Infof("Mapping host uid %d to container uid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size)
+ cmd.SysProcAttr.UidMappings = append(cmd.SysProcAttr.UidMappings, syscall.SysProcIDMap{
+ ContainerID: int(idMap.ContainerID),
+ HostID: int(idMap.HostID),
+ Size: int(idMap.Size),
+ })
+ }
+ for _, idMap := range s.Linux.GIDMappings {
+ log.Infof("Mapping host gid %d to container gid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size)
+ cmd.SysProcAttr.GidMappings = append(cmd.SysProcAttr.GidMappings, syscall.SysProcIDMap{
+ ContainerID: int(idMap.ContainerID),
+ HostID: int(idMap.HostID),
+ Size: int(idMap.Size),
+ })
+ }
+}
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
new file mode 100644
index 000000000..1b6a1d9a6
--- /dev/null
+++ b/runsc/sandbox/network.go
@@ -0,0 +1,348 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sandbox
+
+import (
+ "fmt"
+ "net"
+ "os"
+ "path/filepath"
+ "runtime"
+ "strconv"
+ "strings"
+ "syscall"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "github.com/vishvananda/netlink"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/urpc"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+)
+
+// setupNetwork configures the network stack to mimic the local network
+// configuration. Docker uses network namespaces with vnets to configure the
+// network for the container. The untrusted app expects to see the same network
+// inside the sandbox. Routing and port mapping is handled directly by docker
+// with most of network information not even available to the runtime.
+//
+// Netstack inside the sandbox speaks directly to the device using a raw socket.
+// All IP addresses assigned to the NIC, are removed and passed on to netstack's
+// device.
+//
+// If 'conf.Network' is NoNetwork, skips local configuration and creates a
+// loopback interface only.
+//
+// Run the following container to test it:
+// docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4
+func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Config) error {
+ log.Infof("Setting up network")
+
+ // HACK!
+ //
+ // When kubernetes starts a pod, it first creates a sandbox with an
+ // application that just pauses forever. Later, when a container is
+ // added to the pod, kubernetes will create another sandbox with a
+ // config that corresponds to the containerized application, and add it
+ // to the same namespaces as the pause sandbox.
+ //
+ // Running a second sandbox currently breaks because the two sandboxes
+ // have the same network namespace and configuration, and try to create
+ // a tap device on the same host device which fails.
+ //
+ // Runsc will eventually need to detect that this container is meant to
+ // be run in the same sandbox as the pausing application, and somehow
+ // make that happen.
+ //
+ // For now the following HACK disables networking for the "pause"
+ // sandbox, allowing the second sandbox to start up successfully.
+ //
+ // Cri-o helpfully adds the "ContainerType" annotation that we can use
+ // to detect whether we are a pod or container. Cri-containerd will
+ // support this eventually, but does not currently
+ // (https://github.com/kubernetes-incubator/cri-containerd/issues/512).
+ //
+ // Thus, to support cri-containerd, we check if the exec args is
+ // "/pause", which is pretty gross.
+ //
+ // TODO: Remove this once multiple containers per sandbox
+ // is properly supported.
+ if spec.Annotations["io.kubernetes.cri-o.ContainerType"] == "sandbox" || spec.Process.Args[0] == "/pause" {
+ log.Warningf("HACK: Disabling network")
+ conf.Network = boot.NetworkNone
+ }
+
+ switch conf.Network {
+ case boot.NetworkNone:
+ log.Infof("Network is disabled, create loopback interface only")
+ if err := createDefaultLoopbackInterface(conn); err != nil {
+ return fmt.Errorf("error creating default loopback interface: %v", err)
+ }
+ case boot.NetworkSandbox:
+ // Build the path to the net namespace of the sandbox process.
+ // This is what we will copy.
+ nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
+ if err := createInterfacesAndRoutesFromNS(conn, nsPath); err != nil {
+ return fmt.Errorf("error creating interfaces from net namespace %q: %v", nsPath, err)
+ }
+ case boot.NetworkHost:
+ // Nothing to do here.
+ default:
+ return fmt.Errorf("Invalid network type: %d", conf.Network)
+ }
+ return nil
+}
+
+func createDefaultLoopbackInterface(conn *urpc.Client) error {
+ link := boot.LoopbackLink{
+ Name: "lo",
+ Addresses: []net.IP{
+ net.IP("\x7f\x00\x00\x01"),
+ net.IPv6loopback,
+ },
+ Routes: []boot.Route{
+ {
+ Destination: net.IP("\x7f\x00\x00\x00"),
+ Mask: net.IPMask("\xff\x00\x00\x00"),
+ },
+ {
+ Destination: net.IPv6loopback,
+ Mask: net.IPMask(strings.Repeat("\xff", 16)),
+ },
+ },
+ }
+ if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{
+ LoopbackLinks: []boot.LoopbackLink{link},
+ }, nil); err != nil {
+ return fmt.Errorf("error creating loopback link and routes: %v", err)
+ }
+ return nil
+}
+
+func joinNetNS(nsPath string) (func(), error) {
+ runtime.LockOSThread()
+ restoreNS, err := applyNS(specs.LinuxNamespace{
+ Type: specs.NetworkNamespace,
+ Path: nsPath,
+ })
+ if err != nil {
+ runtime.UnlockOSThread()
+ return nil, fmt.Errorf("error joining net namespace %q: %v", nsPath, err)
+ }
+ return func() {
+ restoreNS()
+ runtime.UnlockOSThread()
+ }, nil
+}
+
// isRootNS reports whether the given interfaces look like they belong to the
// root net namespace. The heuristic is the presence of an interface named
// exactly "docker0".
//
// TODO: Find a better way to detect root network.
func isRootNS(ifaces []net.Interface) bool {
	for i := range ifaces {
		if ifaces[i].Name == "docker0" {
			return true
		}
	}
	return false
}
+
+// createInterfacesAndRoutesFromNS scrapes the interface and routes from the
+// net namespace with the given path, creates them in the sandbox, and removes
+// them from the host.
+func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string) error {
+ // Join the network namespace that we will be copying.
+ restore, err := joinNetNS(nsPath)
+ if err != nil {
+ return err
+ }
+ defer restore()
+
+ // Get all interfaces in the namespace.
+ ifaces, err := net.Interfaces()
+ if err != nil {
+ return fmt.Errorf("error querying interfaces: %v", err)
+ }
+
+ if isRootNS(ifaces) {
+ return fmt.Errorf("cannot run in with network enabled in root network namespace")
+ }
+
+ // Collect addresses and routes from the interfaces.
+ var args boot.CreateLinksAndRoutesArgs
+ for _, iface := range ifaces {
+ if iface.Flags&net.FlagUp == 0 {
+ log.Infof("Skipping down interface: %+v", iface)
+ continue
+ }
+
+ ifaddrs, err := iface.Addrs()
+ if err != nil {
+ return fmt.Errorf("error fetching interface addresses for %q: %v", iface.Name, err)
+ }
+
+ // We build our own loopback devices.
+ if iface.Flags&net.FlagLoopback != 0 {
+ links, err := loopbackLinks(iface, ifaddrs)
+ if err != nil {
+ return fmt.Errorf("error getting loopback routes and links for iface %q: %v", iface.Name, err)
+ }
+ args.LoopbackLinks = append(args.LoopbackLinks, links...)
+ continue
+ }
+
+ // Get the link for the interface.
+ ifaceLink, err := netlink.LinkByName(iface.Name)
+ if err != nil {
+ return fmt.Errorf("error getting link for interface %q: %v", iface.Name, err)
+ }
+
+ // Create the socket.
+ const protocol = 0x0300 // htons(ETH_P_ALL)
+ fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
+ if err != nil {
+ return fmt.Errorf("unable to create raw socket: %v", err)
+ }
+ deviceFile := os.NewFile(uintptr(fd), "raw-device-fd")
+
+ // Bind to the appropriate device.
+ ll := syscall.SockaddrLinklayer{
+ Protocol: protocol,
+ Ifindex: ifaceLink.Attrs().Index,
+ Hatype: 0, // No ARP type.
+ Pkttype: syscall.PACKET_OTHERHOST,
+ }
+ if err := syscall.Bind(fd, &ll); err != nil {
+ return fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
+ }
+
+ // Scrape the routes before removing the address, since that
+ // will remove the routes as well.
+ routes, def, err := routesForIface(iface)
+ if err != nil {
+ return fmt.Errorf("error getting routes for interface %q: %v", iface.Name, err)
+ }
+ if def != nil {
+ if !args.DefaultGateway.Route.Empty() {
+ return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, def, args.DefaultGateway)
+ }
+ args.DefaultGateway.Route = *def
+ args.DefaultGateway.Name = iface.Name
+ }
+
+ link := boot.FDBasedLink{
+ Name: iface.Name,
+ MTU: iface.MTU,
+ Routes: routes,
+ }
+
+ // Collect the addresses for the interface, enable forwarding,
+ // and remove them from the host.
+ for _, ifaddr := range ifaddrs {
+ ipNet, ok := ifaddr.(*net.IPNet)
+ if !ok {
+ return fmt.Errorf("address is not IPNet: %t %+v", ifaddr, ifaddr)
+ }
+ link.Addresses = append(link.Addresses, ipNet.IP)
+
+ // Steal IP address from NIC.
+ if err := removeAddress(ifaceLink, ipNet.String()); err != nil {
+ return fmt.Errorf("error removing address %v from device %q: %v", iface.Name, ipNet, err)
+ }
+ }
+
+ args.FilePayload.Files = append(args.FilePayload.Files, deviceFile)
+ args.FDBasedLinks = append(args.FDBasedLinks, link)
+ }
+
+ log.Debugf("Setting up network, config: %+v", args)
+ if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil {
+ return fmt.Errorf("error creating links and routes: %v", err)
+ }
+ return nil
+}
+
+// loopbackLinks collects the links for a loopback interface.
+func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink, error) {
+ var links []boot.LoopbackLink
+ for _, addr := range addrs {
+ ipNet, ok := addr.(*net.IPNet)
+ if !ok {
+ return nil, fmt.Errorf("address is not IPNet: %t %+v", addr, addr)
+ }
+ links = append(links, boot.LoopbackLink{
+ Name: iface.Name,
+ Addresses: []net.IP{ipNet.IP},
+ Routes: []boot.Route{{
+ Destination: ipNet.IP.Mask(ipNet.Mask),
+ Mask: ipNet.Mask,
+ }},
+ })
+ }
+ return links, nil
+}
+
+// routesForIface iterates over all routes for the given interface and converts
+// them to boot.Routes.
+func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) {
+ link, err := netlink.LinkByIndex(iface.Index)
+ if err != nil {
+ return nil, nil, err
+ }
+ rs, err := netlink.RouteList(link, netlink.FAMILY_ALL)
+ if err != nil {
+ return nil, nil, fmt.Errorf("error getting routes from %q: %v", iface.Name, err)
+ }
+
+ var def *boot.Route
+ var routes []boot.Route
+ for _, r := range rs {
+ // Is it a default route?
+ if r.Dst == nil {
+ if r.Gw == nil {
+ return nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r)
+ }
+ if def != nil {
+ return nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, def, r)
+ }
+ emptyAddr := net.IPv6zero
+ if r.Gw.To4() != nil {
+ emptyAddr = net.IPv4zero
+ }
+ // Create a catch all route to the gateway.
+ def = &boot.Route{
+ Destination: emptyAddr,
+ Mask: net.IPMask(emptyAddr),
+ Gateway: r.Gw,
+ }
+ continue
+ }
+ routes = append(routes, boot.Route{
+ Destination: r.Dst.IP.Mask(r.Dst.Mask),
+ Mask: r.Dst.Mask,
+ })
+ }
+ return routes, def, nil
+}
+
+// removeAddress removes IP address from network device. It's equivalent to:
+// ip addr del <ipAndMask> dev <name>
+func removeAddress(source netlink.Link, ipAndMask string) error {
+ addr, err := netlink.ParseAddr(ipAndMask)
+ if err != nil {
+ return err
+ }
+ return netlink.AddrDel(source, addr)
+}
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
new file mode 100644
index 000000000..b2fa1d58e
--- /dev/null
+++ b/runsc/sandbox/sandbox.go
@@ -0,0 +1,666 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package sandbox creates and manipulates sandboxes.
+package sandbox
+
+import (
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "regexp"
+ "strconv"
+ "syscall"
+ "time"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "golang.org/x/sys/unix"
+ "gvisor.googlesource.com/gvisor/pkg/control/client"
+ "gvisor.googlesource.com/gvisor/pkg/control/server"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/control"
+ "gvisor.googlesource.com/gvisor/pkg/urpc"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
// metadataFilename is the name of the metadata file relative to sandboxRoot
// that holds sandbox metadata.
const metadataFilename = "meta.json"

// idRegex matches non-empty ids made only of word characters and the
// characters in the range '+' through '.' (that is '+', ',', '-' and '.').
//
// See libcontainer/factory_linux.go
var idRegex = regexp.MustCompile(`^[\w+-\.]+$`)

// validateID validates the sandbox id.
//
// Besides matching idRegex, the id must not be "." or "..": the id is joined
// onto the root directory to form the sandbox directory, and those two names
// would resolve to the root directory itself or to its parent.
func validateID(id string) error {
	if !idRegex.MatchString(id) || id == "." || id == ".." {
		return fmt.Errorf("invalid sandbox id: %v", id)
	}
	return nil
}
+
+// Sandbox wraps a child sandbox process, and is responsible for saving and
+// loading sandbox metadata to disk.
+//
+// Within a root directory, we maintain subdirectories for each sandbox named
+// with the sandbox id. The sandbox metadata is is stored as json within the
+// sandbox directoy in a file named "meta.json". This metadata format is
+// defined by us, and is not part of the OCI spec.
+//
+// Sandboxes must write this metadata file after any change to their internal
+// state. The entire sandbox directory is deleted when the sandbox is
+// destroyed.
+//
+// TODO: Protect against concurrent changes to the sandbox metadata
+// file.
+type Sandbox struct {
+ // ID is the sandbox ID.
+ ID string `json:"id"`
+
+ // Spec is the OCI runtime spec that configures this sandbox.
+ Spec *specs.Spec `json:"spec"`
+
+ // BundleDir is the directory containing the sandbox bundle.
+ BundleDir string `json:"bundleDir"`
+
+ // SandboxRoot is the directory containing the sandbox metadata file.
+ SandboxRoot string `json:"sandboxRoot"`
+
+ // CreatedAt is the time the sandbox was created.
+ CreatedAt time.Time `json:"createdAt"`
+
+ // Owner is the sandbox owner.
+ Owner string `json:"owner"`
+
+ // ConsoleSocket is the path to a unix domain socket that will receive
+ // the console FD. It is only used during create, so we don't need to
+ // store it in the metadata.
+ ConsoleSocket string `json:"-"`
+
+ // Pid is the pid of the running sandbox. Only valid if Status is
+ // Created or Running.
+ Pid int `json:"pid"`
+
+ // GoferPid is the pid of the gofer running along side the sandbox. May be 0
+ // if the gofer has been killed or it's not being used.
+ GoferPid int `json:"goferPid"`
+
+ // Status is the current sandbox Status.
+ Status Status `json:"status"`
+}
+
+// Create creates the sandbox subprocess and writes the metadata file. Args
+// are additional arguments that will be passed to the sandbox process.
+func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string, args []string) (*Sandbox, error) {
+ log.Debugf("Create sandbox %q in root dir: %s", id, conf.RootDir)
+ if err := validateID(id); err != nil {
+ return nil, err
+ }
+
+ sandboxRoot := filepath.Join(conf.RootDir, id)
+ if exists(sandboxRoot) {
+ return nil, fmt.Errorf("sandbox with id %q already exists: %q ", id, sandboxRoot)
+ }
+
+ s := &Sandbox{
+ ID: id,
+ Spec: spec,
+ ConsoleSocket: consoleSocket,
+ BundleDir: bundleDir,
+ SandboxRoot: sandboxRoot,
+ Status: Creating,
+ Owner: os.Getenv("USER"),
+ }
+
+ // Create sandbox process. If anything errors between now and the end of this
+ // function, we MUST clean up all sandbox resources.
+ if err := s.createProcesses(conf, args); err != nil {
+ s.Destroy()
+ return nil, err
+ }
+
+ // Wait for the control server to come up (or timeout). The sandbox is
+ // not "created" until that happens.
+ if err := s.waitForCreated(10 * time.Second); err != nil {
+ s.Destroy()
+ return nil, err
+ }
+
+ s.Status = Created
+ s.CreatedAt = time.Now()
+
+ // Save the metadata file.
+ if err := s.save(); err != nil {
+ s.Destroy()
+ return nil, err
+ }
+
+ // Write the pid file. Containerd consideres the create complete after
+ // this file is created, so it must be the last thing we do.
+ if pidFile != "" {
+ if err := ioutil.WriteFile(pidFile, []byte(strconv.Itoa(s.Pid)), 0644); err != nil {
+ s.Destroy()
+ return nil, fmt.Errorf("error writing pid file: %v", err)
+ }
+ }
+
+ return s, nil
+}
+
+// Run is a helper that calls Create + Start + Wait.
+func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string, args []string) (syscall.WaitStatus, error) {
+ s, err := Create(id, spec, conf, bundleDir, consoleSocket, pidFile, args)
+ if err != nil {
+ return 0, fmt.Errorf("error creating sandbox: %v", err)
+ }
+ if err := s.Start(conf); err != nil {
+ return 0, fmt.Errorf("error starting sandbox: %v", err)
+ }
+ return s.Wait()
+}
+
+// Load loads a sandbox from with the given id from a metadata file.
+func Load(rootDir, id string) (*Sandbox, error) {
+ log.Debugf("Load sandbox %q %q", rootDir, id)
+ if err := validateID(id); err != nil {
+ return nil, err
+ }
+ sandboxRoot := filepath.Join(rootDir, id)
+ if !exists(sandboxRoot) {
+ return nil, fmt.Errorf("sandbox with id %q does not exist", id)
+ }
+ metaFile := filepath.Join(sandboxRoot, metadataFilename)
+ if !exists(metaFile) {
+ return nil, fmt.Errorf("sandbox with id %q does not have metadata file %q", id, metaFile)
+ }
+ metaBytes, err := ioutil.ReadFile(metaFile)
+ if err != nil {
+ return nil, fmt.Errorf("error reading sandbox metadata file %q: %v", metaFile, err)
+ }
+ var s Sandbox
+ if err := json.Unmarshal(metaBytes, &s); err != nil {
+ return nil, fmt.Errorf("error unmarshaling sandbox metadata from %q: %v", metaFile, err)
+ }
+
+ // If the status is "Running" or "Created", check that the process
+ // still exists, and set it to Stopped if it does not.
+ //
+ // This is inherintly racey.
+ if s.Status == Running || s.Status == Created {
+ // Send signal 0 to check if process exists.
+ if err := s.Signal(0); err != nil {
+ // Process no longer exists.
+ s.Status = Stopped
+ s.Pid = 0
+ }
+ }
+
+ return &s, nil
+}
+
+// List returns all sandbox ids in the given root directory.
+func List(rootDir string) ([]string, error) {
+ log.Debugf("List sandboxes %q", rootDir)
+ fs, err := ioutil.ReadDir(rootDir)
+ if err != nil {
+ return nil, fmt.Errorf("ReadDir(%s) failed: %v", rootDir, err)
+ }
+ var out []string
+ for _, f := range fs {
+ out = append(out, f.Name())
+ }
+ return out, nil
+}
+
+// State returns the metadata of the sandbox.
+func (s *Sandbox) State() specs.State {
+ return specs.State{
+ Version: specs.Version,
+ ID: s.ID,
+ Status: s.Status.String(),
+ Pid: s.Pid,
+ Bundle: s.BundleDir,
+ }
+}
+
+// Start starts running the containerized process inside the sandbox.
+func (s *Sandbox) Start(conf *boot.Config) error {
+ log.Debugf("Start sandbox %q, pid: %d", s.ID, s.Pid)
+ if s.Status != Created {
+ return fmt.Errorf("cannot start container in state %s", s.Status)
+ }
+
+ // "If any prestart hook fails, the runtime MUST generate an error,
+ // stop and destroy the container".
+ if s.Spec.Hooks != nil {
+ if err := executeHooks(s.Spec.Hooks.Prestart, s.State()); err != nil {
+ s.Destroy()
+ return err
+ }
+ }
+
+ c, err := s.connect()
+ if err != nil {
+ s.Destroy()
+ return err
+ }
+ defer c.Close()
+
+ // Configure the network.
+ if err := setupNetwork(c, s.Pid, s.Spec, conf); err != nil {
+ s.Destroy()
+ return fmt.Errorf("error setting up network: %v", err)
+ }
+
+ // Send a message to the sandbox control server to start the
+ // application.
+ if err := c.Call(boot.ApplicationStart, nil, nil); err != nil {
+ s.Destroy()
+ return fmt.Errorf("error starting sandbox: %v", err)
+ }
+
+ // "If any poststart hook fails, the runtime MUST log a warning, but
+ // the remaining hooks and lifecycle continue as if the hook had
+ // succeeded".
+ if s.Spec.Hooks != nil {
+ executeHooksBestEffort(s.Spec.Hooks.Poststart, s.State())
+ }
+
+ s.Status = Running
+ return s.save()
+}
+
+// Processes retrieves the list of processes and associated metadata inside a
+// sandbox.
+func (s *Sandbox) Processes() ([]*control.Process, error) {
+ if s.Status != Running {
+ return nil, fmt.Errorf("cannot get processes of container %q because it isn't running. It is in state %v", s.ID, s.Status)
+ }
+
+ c, err := s.connect()
+ if err != nil {
+ return nil, err
+ }
+ defer c.Close()
+
+ var pl []*control.Process
+ if err := c.Call(boot.ApplicationProcesses, nil, &pl); err != nil {
+ return nil, fmt.Errorf("error retrieving process data from sandbox: %v", err)
+ }
+ return pl, nil
+}
+
+// Execute runs the specified command in the sandbox.
+func (s *Sandbox) Execute(e *control.ExecArgs) (syscall.WaitStatus, error) {
+ log.Debugf("Execute in sandbox %q, pid: %d, args: %+v", s.ID, s.Pid, e)
+ if s.Status != Created && s.Status != Running {
+ return 0, fmt.Errorf("cannot exec in container in state %s", s.Status)
+ }
+
+ log.Debugf("Connecting to sandbox...")
+ c, err := s.connect()
+ if err != nil {
+ return 0, fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err)
+ }
+ defer c.Close()
+
+ // Send a message to the sandbox control server to start the application.
+ var waitStatus uint32
+ if err := c.Call(boot.ApplicationExecute, e, &waitStatus); err != nil {
+ return 0, fmt.Errorf("error executing in sandbox: %v", err)
+ }
+
+ return syscall.WaitStatus(waitStatus), nil
+}
+
+// Event retrieves stats about the sandbox such as memory and CPU utilization.
+func (s *Sandbox) Event() (*boot.Event, error) {
+ if s.Status != Running && s.Status != Created {
+ return nil, fmt.Errorf("cannot get events for container in state: %s", s.Status)
+ }
+
+ c, err := s.connect()
+ if err != nil {
+ return nil, err
+ }
+ defer c.Close()
+
+ var e boot.Event
+ if err := c.Call(boot.ApplicationEvent, nil, &e); err != nil {
+ return nil, fmt.Errorf("error retrieving event data from sandbox: %v", err)
+ }
+ e.ID = s.ID
+ return &e, nil
+}
+
+// connect opens a client connection to the control server of this sandbox,
+// addressed by the control socket derived from the sandbox ID. Failures are
+// reported together with the sandbox pid.
+func (s *Sandbox) connect() (*urpc.Client, error) {
+	log.Debugf("Connecting to sandbox...")
+	addr := boot.ControlSocketAddr(s.ID)
+	conn, connErr := client.ConnectTo(addr)
+	if connErr == nil {
+		return conn, nil
+	}
+	return nil, fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, connErr)
+}
+
+// createProcesses launches the processes backing the sandbox: first the gofer
+// (when one is needed), then the sandbox itself, which receives the files
+// returned by the gofer setup. args are passed on to both commands.
+func (s *Sandbox) createProcesses(conf *boot.Config, args []string) error {
+	bin, err := specutils.BinPath()
+	if err != nil {
+		return err
+	}
+
+	goferFiles, err := s.createGoferProcess(conf, bin, args)
+	if err == nil {
+		err = s.createSandboxProcess(conf, bin, args, goferFiles)
+	}
+	return err
+}
+
+// createGoferProcess starts the gofer as a subprocess by running the "gofer"
+// command and returns the sandbox ends of the socket pairs that connect the
+// sandbox to it: one for the root mount plus one for each 9P mount in the
+// spec.
+//
+// When conf.FileAccess is not boot.FileAccessProxy no gofer is started and
+// (nil, nil) is returned. On success the caller owns the returned files and
+// must close them, and s.GoferPid is set. On failure every socket created
+// here is closed before returning.
+func (s *Sandbox) createGoferProcess(conf *boot.Config, binPath string, commonArgs []string) ([]*os.File, error) {
+	if conf.FileAccess != boot.FileAccessProxy {
+		// Don't start a gofer. The sandbox will access host FS directly.
+		return nil, nil
+	}
+
+	var args []string
+	args = append(args, commonArgs...)
+	args = append(args, "gofer", "--bundle", s.BundleDir)
+
+	// Start with root mount and then add any other additional mount.
+	mountCount := 1
+	for _, m := range s.Spec.Mounts {
+		if specutils.Is9PMount(m) {
+			mountCount++
+		}
+	}
+
+	sandEnds := make([]*os.File, 0, mountCount)
+	goferEnds := make([]*os.File, 0, mountCount)
+
+	// The gofer ends are only needed in this process until the child has
+	// been started, so they are always closed on return. The sandbox ends
+	// are handed to the caller on success and are closed here only when
+	// something fails; otherwise they would leak.
+	started := false
+	defer func() {
+		for _, f := range goferEnds {
+			f.Close()
+		}
+		if !started {
+			for _, f := range sandEnds {
+				f.Close()
+			}
+		}
+	}()
+
+	for i := 0; i < mountCount; i++ {
+		// Create socket that connects the sandbox and gofer.
+		fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0)
+		if err != nil {
+			return nil, err
+		}
+		sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox io fd"))
+		goferEnds = append(goferEnds, os.NewFile(uintptr(fds[1]), "gofer io fd"))
+
+		// Entry i of cmd.ExtraFiles becomes fd 3+i in the child.
+		args = append(args, fmt.Sprintf("--io-fds=%d", 3+i))
+	}
+
+	cmd := exec.Command(binPath, args...)
+	cmd.ExtraFiles = goferEnds
+
+	// Setup any uid/gid mappings, and create or join the configured user
+	// namespace so the gofer's view of the filesystem aligns with the
+	// users in the sandbox.
+	setUIDGIDMappings(cmd, s.Spec)
+	nss := filterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, s.Spec)
+
+	// Start the gofer in the given namespace.
+	log.Debugf("Starting gofer: %s %v", binPath, args)
+	if err := startInNS(cmd, nss); err != nil {
+		return nil, err
+	}
+	s.GoferPid = cmd.Process.Pid
+	log.Infof("Gofer started, pid: %d", cmd.Process.Pid)
+
+	started = true
+	return sandEnds, nil
+}
+
+// createSandboxProcess starts the sandbox as a subprocess by running the "boot"
+// command, passing in the bundle dir.
+//
+// The child receives the control server socket as its first extra file,
+// followed by ioFiles (the sandbox ends of the gofer sockets, if any); each is
+// announced to the child through a --controller-fd or --io-fds flag. All of
+// these files, including ioFiles, are closed in this process when the
+// function returns. On success s.Pid is set to the pid of the new process.
+func (s *Sandbox) createSandboxProcess(conf *boot.Config, binPath string, commonArgs []string, ioFiles []*os.File) error {
+	// nextFD is used to get unused FDs that we can pass to the sandbox. It
+	// starts at 3 because 0, 1, and 2 are taken by stdin/out/err.
+	nextFD := 3
+
+	// Create control server socket here and donate FD to child process because
+	// it may be in a different network namespace and won't be reachable from
+	// outside.
+	fd, err := server.CreateSocket(boot.ControlSocketAddr(s.ID))
+	if err != nil {
+		return fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err)
+	}
+
+	consoleEnabled := s.ConsoleSocket != ""
+
+	cmd := exec.Command(binPath, commonArgs...)
+	cmd.SysProcAttr = &syscall.SysProcAttr{}
+	cmd.Args = append(cmd.Args,
+		"boot",
+		"--bundle", s.BundleDir,
+		"--controller-fd="+strconv.Itoa(nextFD),
+		fmt.Sprintf("--console=%t", consoleEnabled))
+	nextFD++
+
+	controllerFile := os.NewFile(uintptr(fd), "control_server_socket")
+	defer controllerFile.Close()
+	cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile)
+
+	// If there is a gofer, sends all socket ends to the sandbox.
+	for _, f := range ioFiles {
+		// The files must stay open until the child has been started, so
+		// they are closed when this function returns, not per iteration.
+		defer f.Close()
+		cmd.ExtraFiles = append(cmd.ExtraFiles, f)
+		cmd.Args = append(cmd.Args, "--io-fds="+strconv.Itoa(nextFD))
+		nextFD++
+	}
+
+	// If the console control socket file is provided, then create a new
+	// pty master/slave pair and set the tty on the sandbox process.
+	if consoleEnabled {
+		// setupConsole will send the master on the socket, and return
+		// the slave.
+		tty, err := setupConsole(s.ConsoleSocket)
+		if err != nil {
+			return fmt.Errorf("error setting up control socket %q: %v", s.ConsoleSocket, err)
+		}
+		defer tty.Close()
+
+		cmd.Stdin = tty
+		cmd.Stdout = tty
+		cmd.Stderr = tty
+		cmd.SysProcAttr.Setctty = true
+		// Ctty must be a descriptor number in the child process, not in
+		// this one. The tty is the child's stdin, so that number is 0.
+		cmd.SysProcAttr.Ctty = 0
+	} else {
+		cmd.Stdin = os.Stdin
+		cmd.Stdout = os.Stdout
+		cmd.Stderr = os.Stderr
+	}
+
+	// Detach from this session, otherwise cmd will get SIGHUP and SIGCONT
+	// when re-parented.
+	cmd.SysProcAttr.Setsid = true
+
+	// nss is the set of namespaces to join or create before starting the sandbox
+	// process. IPC and UTS namespaces from the host are not used as they
+	// are virtualized inside the sandbox. Be paranoid and run inside an empty
+	// namespace for these.
+	log.Infof("Sandbox will be started in empty IPC and UTS namespaces")
+	nss := []specs.LinuxNamespace{
+		{Type: specs.IPCNamespace},
+		{Type: specs.UTSNamespace},
+	}
+
+	if conf.Platform == boot.PlatformPtrace {
+		// TODO: Also set an empty PID namespace so that we limit
+		// access to other host processes.
+		log.Infof("Sandbox will be started in the current PID namespace")
+	} else {
+		log.Infof("Sandbox will be started in empty PID namespace")
+		nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace})
+	}
+
+	if conf.FileAccess == boot.FileAccessProxy {
+		log.Infof("Sandbox will be started in empty mount namespace")
+		nss = append(nss, specs.LinuxNamespace{Type: specs.MountNamespace})
+	} else {
+		log.Infof("Sandbox will be started in the current mount namespace")
+	}
+
+	// Joins the network namespace if network is enabled. The sandbox talks
+	// directly to the host network, which may have been configured in the
+	// namespace.
+	if ns, ok := getNS(specs.NetworkNamespace, s.Spec); ok && conf.Network != boot.NetworkNone {
+		log.Infof("Sandbox will be started in the container's network namespace: %+v", ns)
+		nss = append(nss, ns)
+	} else {
+		log.Infof("Sandbox will be started in empty network namespace")
+		nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace})
+	}
+
+	// User namespace depends on the following options:
+	//   - Host network/filesystem: requires to run inside the user namespace
+	//     specified in the spec or the current namespace if none is configured.
+	//   - Gofer: when using a Gofer, the sandbox process can run isolated in an
+	//     empty namespace.
+	if conf.Network == boot.NetworkHost || conf.FileAccess == boot.FileAccessDirect {
+		if userns, ok := getNS(specs.UserNamespace, s.Spec); ok {
+			log.Infof("Sandbox will be started in container's user namespace: %+v", userns)
+			nss = append(nss, userns)
+			setUIDGIDMappings(cmd, s.Spec)
+		} else {
+			// TODO: Restrict capabilities since it's using current user
+			// namespace, i.e. root.
+			log.Infof("Sandbox will be started in the current user namespace")
+		}
+		// When running in the caller's defined user namespace, apply the same
+		// capabilities to the sandbox process to ensure it abides to the same
+		// rules.
+		cmd.Args = append(cmd.Args, "--apply-caps=true")
+
+	} else {
+		log.Infof("Sandbox will be started in empty user namespace")
+		nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace})
+	}
+
+	log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args)
+	if err := startInNS(cmd, nss); err != nil {
+		return err
+	}
+	s.Pid = cmd.Process.Pid
+	log.Infof("Sandbox started, pid: %d", s.Pid)
+	return nil
+}
+
+// waitForCreated waits for the sandbox subprocess control server to be
+// running, at which point the sandbox is in Created state.
+//
+// It repeatedly tries to connect to the control socket until a connection
+// succeeds (returning nil) or timeout elapses (returning an error).
+func (s *Sandbox) waitForCreated(timeout time.Duration) error {
+	log.Debugf("Waiting for sandbox %q creation", s.ID)
+	tchan := time.After(timeout)
+	for {
+		select {
+		case <-tchan:
+			return fmt.Errorf("timed out waiting for sandbox control server")
+		default:
+			if c, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)); err == nil {
+				// It's alive!
+				c.Close()
+				return nil
+			}
+			// Pause briefly between attempts instead of spinning on
+			// connection failures until the server comes up.
+			time.Sleep(10 * time.Millisecond)
+		}
+	}
+}
+
+// Wait blocks until the sandbox process (s.Pid) exits and returns its
+// WaitStatus, or the error reported while waiting on the process.
+func (s *Sandbox) Wait() (syscall.WaitStatus, error) {
+	log.Debugf("Wait on sandbox %q with pid %d", s.ID, s.Pid)
+	proc, findErr := os.FindProcess(s.Pid)
+	if findErr != nil {
+		// "On Unix systems, FindProcess always succeeds and returns a
+		// Process for the given pid."
+		panic(findErr)
+	}
+
+	state, waitErr := proc.Wait()
+	if waitErr != nil {
+		return 0, waitErr
+	}
+	status := state.Sys().(syscall.WaitStatus)
+	return status, nil
+}
+
+// Destroy frees all resources associated with the sandbox.
+//
+// It kills the sandbox and gofer processes (when their pids are set), removes
+// the sandbox root directory and, if the sandbox was Created or Running, runs
+// the poststop hooks. Kill and removal failures are logged as warnings and
+// never abort the teardown; the returned error is always nil.
+func (s *Sandbox) Destroy() error {
+	log.Debugf("Destroy sandbox %q", s.ID)
+	if s.Pid != 0 {
+		// TODO: Too harsh?
+		log.Debugf("Killing sandbox %q", s.ID)
+		if err := sendSignal(s.Pid, unix.SIGKILL); err != nil {
+			log.Warningf("Failed to kill sandbox %q: %v", s.ID, err)
+		}
+		s.Pid = 0
+	}
+	if s.GoferPid != 0 {
+		log.Debugf("Killing gofer for sandbox %q", s.ID)
+		if err := sendSignal(s.GoferPid, unix.SIGKILL); err != nil {
+			log.Warningf("Failed to kill gofer for sandbox %q: %v", s.ID, err)
+		}
+		s.GoferPid = 0
+	}
+	if err := os.RemoveAll(s.SandboxRoot); err != nil {
+		log.Warningf("Failed to delete sandbox root directory %q, err: %v", s.SandboxRoot, err)
+	}
+
+	// "If any poststop hook fails, the runtime MUST log a warning, but the
+	// remaining hooks and lifecycle continue as if the hook had succeeded".
+	if s.Spec.Hooks != nil && (s.Status == Created || s.Status == Running) {
+		executeHooksBestEffort(s.Spec.Hooks.Poststop, s.State())
+	}
+
+	s.Status = Stopped
+	return nil
+}
+
+// Signal delivers sig to the sandbox process. For a sandbox that is already
+// Stopped nothing is sent: a warning is logged and nil is returned.
+func (s *Sandbox) Signal(sig syscall.Signal) error {
+	log.Debugf("Signal sandbox %q", s.ID)
+	if s.Status != Stopped {
+		return sendSignal(s.Pid, sig)
+	}
+	log.Warningf("sandbox %q not running, not sending signal %v to pid %d", s.ID, sig, s.Pid)
+	return nil
+}
+
+// sendSignal sends sig to the process with the given pid, wrapping any
+// failure with the signal number and the pid.
+func sendSignal(pid int, sig syscall.Signal) error {
+	killErr := syscall.Kill(pid, sig)
+	if killErr == nil {
+		return nil
+	}
+	return fmt.Errorf("error sending signal %d to pid %d: %v", sig, pid, killErr)
+}
+
+// save persists the sandbox metadata: the sandbox is JSON-encoded and written
+// to the metadata file inside the sandbox root directory, which is created
+// first if needed.
+func (s *Sandbox) save() error {
+	log.Debugf("Save sandbox %q", s.ID)
+	root := s.SandboxRoot
+	if mkErr := os.MkdirAll(root, 0711); mkErr != nil {
+		return fmt.Errorf("error creating sandbox root directory %q: %v", root, mkErr)
+	}
+
+	encoded, encErr := json.Marshal(s)
+	if encErr != nil {
+		return fmt.Errorf("error marshaling sandbox metadata: %v", encErr)
+	}
+
+	target := filepath.Join(root, metadataFilename)
+	if wrErr := ioutil.WriteFile(target, encoded, 0640); wrErr != nil {
+		return fmt.Errorf("error writing sandbox metadata: %v", wrErr)
+	}
+	return nil
+}
+
+// exists reports whether the file f can be stat'ed. A stat failure other than
+// "does not exist" is logged as a warning and treated as absent.
+func exists(f string) bool {
+	_, statErr := os.Stat(f)
+	switch {
+	case statErr == nil:
+		return true
+	case !os.IsNotExist(statErr):
+		log.Warningf("error checking for file %q: %v", f, statErr)
+	}
+	return false
+}
diff --git a/runsc/sandbox/sandbox_test.go b/runsc/sandbox/sandbox_test.go
new file mode 100644
index 000000000..6c71cac30
--- /dev/null
+++ b/runsc/sandbox/sandbox_test.go
@@ -0,0 +1,649 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sandbox_test
+
+import (
+ "encoding/json"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "os"
+ "os/signal"
+ "path/filepath"
+ "reflect"
+ "strings"
+ "syscall"
+ "testing"
+ "time"
+
+ "context"
+ "flag"
+ "github.com/google/subcommands"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "golang.org/x/sys/unix"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/control"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/unet"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/cmd"
+ "gvisor.googlesource.com/gvisor/runsc/sandbox"
+)
+
+// init sets the log level to Debug when the test binary starts.
+func init() {
+ log.SetLevel(log.Debug)
+}
+
+// writeSpec JSON-encodes spec and stores it as "config.json" inside dir.
+func writeSpec(dir string, spec *specs.Spec) error {
+	encoded, encErr := json.Marshal(spec)
+	if encErr != nil {
+		return encErr
+	}
+	configPath := filepath.Join(dir, "config.json")
+	return ioutil.WriteFile(configPath, encoded, 0755)
+}
+
+// newSpecWithArgs builds a minimal spec for tests: the process runs args with
+// the PATH of the current environment, and the sandbox root is the read-only
+// host filesystem root.
+func newSpecWithArgs(args ...string) *specs.Spec {
+	// The host filesystem root is the sandbox root.
+	root := &specs.Root{
+		Path:     "/",
+		Readonly: true,
+	}
+	process := &specs.Process{
+		Args: args,
+		Env:  []string{"PATH=" + os.Getenv("PATH")},
+	}
+	return &specs.Spec{
+		Root:    root,
+		Process: process,
+	}
+}
+
+// shutdownSignal will be sent to the sandbox in order to shut down cleanly.
+// TestMain listens for it when the test binary runs as the "boot" or "gofer"
+// command and exits with status 0 once it arrives.
+const shutdownSignal = syscall.SIGUSR2
+
+// setupSandbox creates a bundle and root dir for the sandbox, generates a test
+// config, and writes the spec to config.json in the bundle dir.
+//
+// On success the caller owns both directories and should remove them. On
+// failure any directory that was already created is removed again, so
+// nothing is left behind.
+func setupSandbox(spec *specs.Spec) (rootDir, bundleDir string, conf *boot.Config, err error) {
+	rootDir, err = ioutil.TempDir("", "sandboxes")
+	if err != nil {
+		return "", "", nil, fmt.Errorf("error creating root dir: %v", err)
+	}
+
+	bundleDir, err = ioutil.TempDir("", "bundle")
+	if err != nil {
+		os.RemoveAll(rootDir)
+		return "", "", nil, fmt.Errorf("error creating bundle dir: %v", err)
+	}
+
+	if err = writeSpec(bundleDir, spec); err != nil {
+		os.RemoveAll(rootDir)
+		os.RemoveAll(bundleDir)
+		return "", "", nil, fmt.Errorf("error writing spec: %v", err)
+	}
+
+	conf = &boot.Config{
+		RootDir: rootDir,
+		Network: boot.NetworkNone,
+	}
+
+	return rootDir, bundleDir, conf, nil
+}
+
+// uniqueSandboxID returns a sandbox id built from the current time in
+// nanoseconds, so that each test gets its own id.
+//
+// The sandbox id is used to create an abstract unix domain socket, which must
+// be unique. While the sandbox forbids creating two sandboxes with the same
+// name, sometimes between test runs the socket does not get cleaned up quickly
+// enough, causing sandbox creation to fail.
+func uniqueSandboxID() string {
+	nanos := time.Now().UnixNano()
+	id := fmt.Sprintf("test-sandbox-%d", nanos)
+	return id
+}
+
+// waitForProcessList waits for the given process list to show up in the sandbox.
+//
+// The sandbox is polled every 10ms for up to 10 seconds. It returns nil as
+// soon as the reported list equals expected (as judged by procListsEqual),
+// an error if querying the sandbox fails, and otherwise an error that shows
+// the last list received next to the expected one.
+func waitForProcessList(s *sandbox.Sandbox, expected []*control.Process) error {
+	var got []*control.Process
+	for start := time.Now(); time.Since(start) < 10*time.Second; {
+		var err error
+		// Assign to the outer got (no :=) so that the timeout error below
+		// reports the last list that was actually received.
+		got, err = s.Processes()
+		if err != nil {
+			return fmt.Errorf("error getting process data from sandbox: %v", err)
+		}
+		if procListsEqual(got, expected) {
+			return nil
+		}
+		// Process might not have started, try again...
+		time.Sleep(10 * time.Millisecond)
+	}
+	return fmt.Errorf("sandbox got process list: %s, want: %s", procListToString(got), procListToString(expected))
+}
+
+// TestLifecycle tests the basic Create/Start/Signal/Destroy sandbox lifecycle.
+// It verifies after each step that the sandbox can be loaded from disk, and
+// has the correct status.
+func TestLifecycle(t *testing.T) {
+ // The sandbox will just sleep for a long time. We will kill it before
+ // it finishes sleeping.
+ spec := newSpecWithArgs("sleep", "100")
+
+ rootDir, bundleDir, conf, err := setupSandbox(spec)
+ if err != nil {
+ t.Fatalf("error setting up sandbox: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+ defer os.RemoveAll(bundleDir)
+
+ // expectedPL lists the expected process state of the sandbox.
+ expectedPL := []*control.Process{
+ {
+ UID: 0,
+ PID: 1,
+ PPID: 0,
+ C: 0,
+ Cmd: "sleep",
+ },
+ }
+ // Create the sandbox.
+ id := uniqueSandboxID()
+ if _, err := sandbox.Create(id, spec, conf, bundleDir, "", "", nil); err != nil {
+ t.Fatalf("error creating sandbox: %v", err)
+ }
+ // Load the sandbox from disk and check the status.
+ s, err := sandbox.Load(rootDir, id)
+ if err != nil {
+ t.Fatalf("error loading sandbox: %v", err)
+ }
+ if got, want := s.Status, sandbox.Created; got != want {
+ t.Errorf("sandbox status got %v, want %v", got, want)
+ }
+
+ // List should return the sandbox id.
+ ids, err := sandbox.List(rootDir)
+ if err != nil {
+ t.Fatalf("error listing sandboxes: %v", err)
+ }
+ if got, want := ids, []string{id}; !reflect.DeepEqual(got, want) {
+ t.Errorf("sandbox list got %v, want %v", got, want)
+ }
+
+ // Start the sandbox.
+ if err := s.Start(conf); err != nil {
+ t.Fatalf("error starting sandbox: %v", err)
+ }
+ // Load the sandbox from disk and check the status.
+ s, err = sandbox.Load(rootDir, id)
+ if err != nil {
+ t.Fatalf("error loading sandbox: %v", err)
+ }
+ if got, want := s.Status, sandbox.Running; got != want {
+ t.Errorf("sandbox status got %v, want %v", got, want)
+ }
+
+ // Verify that "sleep 100" is running.
+ if err := waitForProcessList(s, expectedPL); err != nil {
+ t.Error(err)
+ }
+
+ // Send the sandbox a signal, which we catch and use to cleanly
+ // shutdown.
+ if err := s.Signal(shutdownSignal); err != nil {
+ t.Fatalf("error sending signal %v to sandbox: %v", shutdownSignal, err)
+ }
+ // Wait for it to die.
+ if _, err := s.Wait(); err != nil {
+ t.Fatalf("error waiting on sandbox: %v", err)
+ }
+ // Load the sandbox from disk and check the status.
+ s, err = sandbox.Load(rootDir, id)
+ if err != nil {
+ t.Fatalf("error loading sandbox: %v", err)
+ }
+ if got, want := s.Status, sandbox.Stopped; got != want {
+ t.Errorf("sandbox status got %v, want %v", got, want)
+ }
+
+ // Destroy the sandbox.
+ if err := s.Destroy(); err != nil {
+ t.Fatalf("error destroying sandbox: %v", err)
+ }
+
+ // List should not return the sandbox id.
+ ids, err = sandbox.List(rootDir)
+ if err != nil {
+ t.Fatalf("error listing sandboxes: %v", err)
+ }
+ if len(ids) != 0 {
+ t.Errorf("expected sandbox list to be empty, but got %v", ids)
+ }
+
+ // Loading the sandbox by id should fail.
+ if _, err = sandbox.Load(rootDir, id); err == nil {
+ t.Errorf("expected loading destroyed sandbox to fail, but it did not")
+ }
+}
+
+// TestExePath checks that the application can be executed when its path is
+// given as a bare name, a relative path or an absolute path, and that running
+// a file that does not exist fails in each of those forms.
+func TestExePath(t *testing.T) {
+	cases := []struct {
+		path    string
+		success bool
+	}{
+		{path: "true", success: true},
+		{path: "bin/true", success: true},
+		{path: "/bin/true", success: true},
+		{path: "thisfiledoesntexit", success: false},
+		{path: "bin/thisfiledoesntexit", success: false},
+		{path: "/bin/thisfiledoesntexit", success: false},
+	}
+	for _, tc := range cases {
+		spec := newSpecWithArgs(tc.path)
+		rootDir, bundleDir, conf, setupErr := setupSandbox(spec)
+		if setupErr != nil {
+			t.Fatalf("exec: %s, error setting up sandbox: %v", tc.path, setupErr)
+		}
+
+		ws, runErr := sandbox.Run(uniqueSandboxID(), spec, conf, bundleDir, "", "", nil)
+
+		// The directories are not needed once the run has finished.
+		os.RemoveAll(rootDir)
+		os.RemoveAll(bundleDir)
+
+		if !tc.success {
+			if runErr == nil {
+				t.Errorf("exec: %s, got: no error, want: error", tc.path)
+			}
+			continue
+		}
+		if runErr != nil {
+			t.Errorf("exec: %s, error running sandbox: %v", tc.path, runErr)
+		}
+		if ws.ExitStatus() != 0 {
+			t.Errorf("exec: %s, got exit status %v want %v", tc.path, ws.ExitStatus(), 0)
+		}
+	}
+}
+
+// TestAppExitStatus tests that we can retrieve the application exit status
+// from the sandbox, both for a command that succeeds and for one that exits
+// with a non-zero status.
+func TestAppExitStatus(t *testing.T) {
+	// First sandbox will succeed.
+	succSpec := newSpecWithArgs("true")
+
+	rootDir, bundleDir, conf, err := setupSandbox(succSpec)
+	if err != nil {
+		t.Fatalf("error setting up sandbox: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+	defer os.RemoveAll(bundleDir)
+
+	ws, err := sandbox.Run(uniqueSandboxID(), succSpec, conf, bundleDir, "", "", nil)
+	if err != nil {
+		t.Fatalf("error running sandbox: %v", err)
+	}
+	if ws.ExitStatus() != 0 {
+		t.Errorf("got exit status %v want %v", ws.ExitStatus(), 0)
+	}
+
+	// Second sandbox exits with non-zero status.
+	wantStatus := 123
+	errSpec := newSpecWithArgs("bash", "-c", fmt.Sprintf("exit %d", wantStatus))
+
+	rootDir2, bundleDir2, conf, err := setupSandbox(errSpec)
+	if err != nil {
+		t.Fatalf("error setting up sandbox: %v", err)
+	}
+	defer os.RemoveAll(rootDir2)
+	defer os.RemoveAll(bundleDir2)
+
+	// Run with the failing spec, matching what was written to bundleDir2.
+	ws, err = sandbox.Run(uniqueSandboxID(), errSpec, conf, bundleDir2, "", "", nil)
+	if err != nil {
+		t.Fatalf("error running sandbox: %v", err)
+	}
+	if ws.ExitStatus() != wantStatus {
+		t.Errorf("got exit status %v want %v", ws.ExitStatus(), wantStatus)
+	}
+}
+
+// TestExec verifies that a sandbox can exec a new program.
+//
+// It starts a sandbox running "sleep 100", execs "sleep 5" as a different
+// user, checks that both processes show up in the process list and finally
+// that the exec call itself returned without error.
+func TestExec(t *testing.T) {
+	const uid = 343
+	spec := newSpecWithArgs("sleep", "100")
+
+	rootDir, bundleDir, conf, err := setupSandbox(spec)
+	if err != nil {
+		t.Fatalf("error setting up sandbox: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+	defer os.RemoveAll(bundleDir)
+
+	// Create and start the sandbox.
+	s, err := sandbox.Create(uniqueSandboxID(), spec, conf, bundleDir, "", "", nil)
+	if err != nil {
+		t.Fatalf("error creating sandbox: %v", err)
+	}
+	defer s.Destroy()
+	if err := s.Start(conf); err != nil {
+		t.Fatalf("error starting sandbox: %v", err)
+	}
+
+	// expectedPL lists the expected process state of the sandbox.
+	expectedPL := []*control.Process{
+		{
+			UID:  0,
+			PID:  1,
+			PPID: 0,
+			C:    0,
+			Cmd:  "sleep",
+		},
+		{
+			UID:  uid,
+			PID:  2,
+			PPID: 0,
+			C:    0,
+			Cmd:  "sleep",
+		},
+	}
+
+	// Verify that "sleep 100" is running.
+	if err := waitForProcessList(s, expectedPL[:1]); err != nil {
+		t.Error(err)
+	}
+
+	execArgs := control.ExecArgs{
+		Filename:         "/bin/sleep",
+		Argv:             []string{"sleep", "5"},
+		Envv:             []string{"PATH=" + os.Getenv("PATH")},
+		WorkingDirectory: "/",
+		KUID:             uid,
+		Detach:           false,
+	}
+
+	// Verify that "sleep 100" and "sleep 5" are running after exec.
+	// First, start running exec (which blocks).
+	status := make(chan error, 1)
+	go func() {
+		exitStatus, err := s.Execute(&execArgs)
+		if err != nil {
+			status <- err
+		} else if exitStatus != 0 {
+			status <- fmt.Errorf("failed with exit status: %v", exitStatus)
+		} else {
+			status <- nil
+		}
+	}()
+
+	if err := waitForProcessList(s, expectedPL); err != nil {
+		t.Fatal(err)
+	}
+
+	// Ensure that exec finished without error.
+	select {
+	case <-time.After(10 * time.Second):
+		t.Fatalf("sandbox timed out waiting for exec to finish.")
+	case st := <-status:
+		if st != nil {
+			// Report the error received from the exec goroutine; the
+			// outer err is nil at this point.
+			t.Errorf("sandbox failed to exec %v: %v", execArgs, st)
+		}
+	}
+}
+
+// TestCapabilities verifies that:
+// - Running exec as non-root UID and GID will result in an error (because the
+//   executable file can't be read).
+// - Running exec as non-root with CAP_DAC_OVERRIDE succeeds because it skips
+//   this check.
+func TestCapabilities(t *testing.T) {
+	const uid = 343
+	const gid = 2401
+	spec := newSpecWithArgs("sleep", "100")
+
+	// We generate files in the host temporary directory.
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: os.TempDir(),
+		Source:      os.TempDir(),
+		Type:        "bind",
+	})
+
+	rootDir, bundleDir, conf, err := setupSandbox(spec)
+	if err != nil {
+		t.Fatalf("error setting up sandbox: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+	defer os.RemoveAll(bundleDir)
+
+	// Create and start the sandbox.
+	s, err := sandbox.Create(uniqueSandboxID(), spec, conf, bundleDir, "", "", nil)
+	if err != nil {
+		t.Fatalf("error creating sandbox: %v", err)
+	}
+	defer s.Destroy()
+	if err := s.Start(conf); err != nil {
+		t.Fatalf("error starting sandbox: %v", err)
+	}
+
+	// expectedPL lists the expected process state of the sandbox.
+	expectedPL := []*control.Process{
+		{
+			UID:  0,
+			PID:  1,
+			PPID: 0,
+			C:    0,
+			Cmd:  "sleep",
+		},
+		{
+			UID:  uid,
+			PID:  2,
+			PPID: 0,
+			C:    0,
+			Cmd:  "exe",
+		},
+	}
+	if err := waitForProcessList(s, expectedPL[:1]); err != nil {
+		t.Fatalf("Failed to wait for sleep to start, err: %v", err)
+	}
+
+	// Create an executable that can't be run with the specified UID:GID.
+	// This shouldn't be callable within the sandbox until we add the
+	// CAP_DAC_OVERRIDE capability to skip the access check.
+	exePath := filepath.Join(rootDir, "exe")
+	if err := ioutil.WriteFile(exePath, []byte("#!/bin/sh\necho hello"), 0770); err != nil {
+		t.Fatalf("couldn't create executable: %v", err)
+	}
+	defer os.Remove(exePath)
+
+	// Need to traverse the intermediate directory. The rest of the test
+	// relies on this, so a failure here is fatal.
+	if err := os.Chmod(rootDir, 0755); err != nil {
+		t.Fatalf("couldn't change permissions of %q: %v", rootDir, err)
+	}
+
+	execArgs := control.ExecArgs{
+		Filename:         exePath,
+		Argv:             []string{exePath},
+		Envv:             []string{"PATH=" + os.Getenv("PATH")},
+		WorkingDirectory: "/",
+		KUID:             uid,
+		KGID:             gid,
+		Capabilities:     &auth.TaskCapabilities{},
+		Detach:           true,
+	}
+
+	// "exe" should fail because we don't have the necessary permissions.
+	if _, err := s.Execute(&execArgs); err == nil {
+		t.Fatalf("sandbox executed without error, but an error was expected")
+	}
+
+	// Now we run with the capability enabled and should succeed.
+	execArgs.Capabilities = &auth.TaskCapabilities{
+		EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE),
+	}
+	// First, start running exec.
+	if _, err := s.Execute(&execArgs); err != nil {
+		t.Fatalf("sandbox failed to exec %v: %v", execArgs, err)
+	}
+
+	if err := waitForProcessList(s, expectedPL); err != nil {
+		t.Error(err)
+	}
+}
+
+// TestConsoleSocket tests that a tty FD is sent over the console socket if
+// one is provided.
+//
+// It listens on a unix socket inside the bundle dir, creates a sandbox with
+// that socket as console socket, and checks that exactly one FD arrives on
+// the accepted connection and that the FD refers to a terminal.
+func TestConsoleSocket(t *testing.T) {
+	spec := newSpecWithArgs("true")
+	rootDir, bundleDir, conf, err := setupSandbox(spec)
+	if err != nil {
+		t.Fatalf("error setting up sandbox: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+	defer os.RemoveAll(bundleDir)
+
+	// Create a named socket and start listening. We use a relative path
+	// to avoid overflowing the unix path length limit (108 chars).
+	socketPath := filepath.Join(bundleDir, "socket")
+	cwd, err := os.Getwd()
+	if err != nil {
+		t.Fatalf("error getting cwd: %v", err)
+	}
+	socketRelPath, err := filepath.Rel(cwd, socketPath)
+	if err != nil {
+		t.Fatalf("error getting relative path for %q from cwd %q: %v", socketPath, cwd, err)
+	}
+	// Fall back to the absolute path when the relative one is longer.
+	if len(socketRelPath) > len(socketPath) {
+		socketRelPath = socketPath
+	}
+	srv, err := unet.BindAndListen(socketRelPath, false)
+	if err != nil {
+		t.Fatalf("error binding and listening to socket %q: %v", socketPath, err)
+	}
+	defer os.Remove(socketPath)
+
+	// Create the sandbox and pass the socket name.
+	id := uniqueSandboxID()
+	s, err := sandbox.Create(id, spec, conf, bundleDir, socketRelPath, "", nil)
+	if err != nil {
+		t.Fatalf("error creating sandbox: %v", err)
+	}
+
+	// Open the other end of the socket.
+	sock, err := srv.Accept()
+	if err != nil {
+		t.Fatalf("error accepting socket connection: %v", err)
+	}
+
+	// Allow 1 fd to be received, which is all we expect.
+	r := sock.Reader(true /* blocking */)
+	r.EnableFDs(1)
+
+	// The socket is closed right after sending the FD, so EOF is
+	// an allowed error.
+	b := [][]byte{{}}
+	if _, err := r.ReadVec(b); err != nil && err != io.EOF {
+		t.Fatalf("error reading from socket connection: %v", err)
+	}
+
+	// We should have gotten a control message.
+	fds, err := r.ExtractFDs()
+	if err != nil {
+		t.Fatalf("error extracting fds from socket connection: %v", err)
+	}
+	if len(fds) != 1 {
+		t.Fatalf("got %d fds from socket, wanted 1", len(fds))
+	}
+
+	// Verify that the fd is a terminal.
+	if _, err := unix.IoctlGetTermios(fds[0], unix.TCGETS); err != nil {
+		t.Errorf("fd is not a terminal (ioctl TCGETS got %v)", err)
+	}
+
+	// Shut it down.
+	if err := s.Destroy(); err != nil {
+		t.Fatalf("error destroying sandbox: %v", err)
+	}
+
+	// Close socket.
+	if err := srv.Close(); err != nil {
+		t.Fatalf("error closing socket: %v", err)
+	}
+}
+
+// procListsEqual is used to check whether 2 Process lists are equal for all
+// implemented fields.
+//
+// The lists must have the same length and match element by element. The
+// Time, STime and C fields are ignored. The comparison works on copies, so
+// the Process values passed in are left untouched.
+func procListsEqual(got, want []*control.Process) bool {
+	if len(got) != len(want) {
+		return false
+	}
+	for i := range got {
+		// Copy the structs so that clearing fields below does not modify
+		// the caller's data through the pointers.
+		pd1 := *got[i]
+		pd2 := *want[i]
+		// Zero out unimplemented and timing dependent fields.
+		pd1.Time, pd2.Time = "", ""
+		pd1.STime, pd2.STime = "", ""
+		pd1.C, pd2.C = 0, 0
+		if pd1 != pd2 {
+			return false
+		}
+	}
+	return true
+}
+
+// procListToString renders pl as a bracketed, comma-separated list in which
+// each process is formatted with %+v.
+func procListToString(pl []*control.Process) string {
+	parts := make([]string, len(pl))
+	for i := range pl {
+		parts[i] = fmt.Sprintf("%+v", pl[i])
+	}
+	return "[" + strings.Join(parts, ",") + "]"
+}
+
+// TestMain acts like runsc if it is called with the "boot" argument, otherwise
+// it just runs the tests. This is required because creating a sandbox will
+// call "/proc/self/exe boot". Normally /proc/self/exe is the runsc binary,
+// but for tests we have to fake it.
+//
+// The "gofer" argument is handled the same way as "boot".
+func TestMain(m *testing.M) {
+ // exit writes coverage data before exiting.
+ // NOTE(review): as written it only calls os.Exit; no coverage-writing
+ // step is visible here. Confirm whether one is expected.
+ exit := func(status int) {
+ os.Exit(status)
+ }
+
+ if !flag.Parsed() {
+ flag.Parse()
+ }
+
+ // If we are passed one of the commands then run it.
+ subcommands.Register(new(cmd.Boot), "boot")
+ subcommands.Register(new(cmd.Gofer), "gofer")
+ switch flag.Arg(0) {
+ case "boot", "gofer":
+ // Run the command in a goroutine so we can block the main
+ // thread waiting for shutdownSignal.
+ go func() {
+ conf := &boot.Config{
+ RootDir: "unused-root-dir",
+ Network: boot.NetworkNone,
+ }
+ var ws syscall.WaitStatus
+ subcmdCode := subcommands.Execute(context.Background(), conf, &ws)
+ if subcmdCode != subcommands.ExitSuccess {
+ panic(fmt.Sprintf("command failed to execute, err: %v", subcmdCode))
+ }
+ // Sandbox exited normally. Shut down this process.
+ // This exits directly with the wait status and does not go
+ // through the exit helper above.
+ os.Exit(ws.ExitStatus())
+ }()
+
+ // Shutdown cleanly when the shutdownSignal is received. This
+ // allows us to write coverage data before exiting.
+ sigc := make(chan os.Signal, 1)
+ signal.Notify(sigc, shutdownSignal)
+ <-sigc
+ exit(0)
+ default:
+ // Otherwise run the tests.
+ exit(m.Run())
+ }
+}
diff --git a/runsc/sandbox/status.go b/runsc/sandbox/status.go
new file mode 100644
index 000000000..6fc936aba
--- /dev/null
+++ b/runsc/sandbox/status.go
@@ -0,0 +1,56 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sandbox
+
+// Status enumerates sandbox statuses. The statuses and their semantics are
+// part of the runtime CLI spec.
+//
+// TODO: Get precise about the transitions between statuses.
+type Status int
+
+const (
+	// Creating indicates "the container is being created".
+	Creating Status = iota
+
+	// Created indicates "the runtime has finished the create operation and
+	// the container process has neither exited nor executed the
+	// user-specified program".
+	Created
+
+	// Running indicates "the container process has executed the
+	// user-specified program but has not exited".
+	Running
+
+	// Stopped indicates "the container process has exited".
+	Stopped
+)
+
+// String converts a Status to a string. These strings are part of the runtime
+// CLI spec and should not be changed. Values outside the declared statuses
+// are rendered as "unknown".
+func (s Status) String() string {
+	names := [...]string{
+		Creating: "creating",
+		Created:  "created",
+		Running:  "running",
+		Stopped:  "stopped",
+	}
+	if s < 0 || int(s) >= len(names) {
+		return "unknown"
+	}
+	return names[s]
+}
diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD
new file mode 100644
index 000000000..ae89260d2
--- /dev/null
+++ b/runsc/specutils/BUILD
@@ -0,0 +1,18 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+# specutils is the Go library built from specutils.go. It is visible only to
+# packages under //runsc.
+go_library(
+ name = "specutils",
+ srcs = ["specutils.go"],
+ importpath = "gvisor.googlesource.com/gvisor/runsc/specutils",
+ visibility = [
+ "//runsc:__subpackages__",
+ ],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/log",
+ "//pkg/sentry/kernel/auth",
+ "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ ],
+)
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
new file mode 100644
index 000000000..bed0f75eb
--- /dev/null
+++ b/runsc/specutils/specutils.go
@@ -0,0 +1,183 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package specutils contains utility functions for working with OCI runtime
+// specs.
+package specutils
+
+import (
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "strings"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+)
+
+// LogSpec logs the spec in a human-friendly way.
+//
+// The whole spec is dumped first, followed by its Hooks, Linux, Process and
+// Root fields, one debug log entry each, all formatted with the %+v verb.
+// spec is dereferenced unconditionally, so it must not be nil.
+func LogSpec(spec *specs.Spec) {
+ log.Debugf("Spec: %+v", spec)
+ // NOTE(review): these fields are presumably pointers that %+v on the
+ // parent struct would print only as addresses, hence the separate dumps;
+ // confirm against the specs package.
+ log.Debugf("Spec.Hooks: %+v", spec.Hooks)
+ log.Debugf("Spec.Linux: %+v", spec.Linux)
+ log.Debugf("Spec.Process: %+v", spec.Process)
+ log.Debugf("Spec.Root: %+v", spec.Root)
+}
+
+// ReadSpec reads an OCI runtime spec from the given bundle directory.
+//
+// TODO: This should validate the spec.
+func ReadSpec(bundleDir string) (*specs.Spec, error) {
+ // The spec file must be in "config.json" inside the bundle directory.
+ specFile := filepath.Join(bundleDir, "config.json")
+ specBytes, err := ioutil.ReadFile(specFile)
+ if err != nil {
+ return nil, fmt.Errorf("error reading spec from file %q: %v", specFile, err)
+ }
+ var spec specs.Spec
+ if err := json.Unmarshal(specBytes, &spec); err != nil {
+ return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile, err, string(specBytes))
+ }
+ return &spec, nil
+}
+
+// GetExecutablePath resolves exec to a path inside the container root.
+//
+// exec is cleaned first. If it then contains a '/', it is treated as an
+// explicit (absolute or relative) path and returned unchanged without
+// consulting PATH. Otherwise the directories of the first "PATH=" entry in env
+// are probed in order under root on the host, and the first match that exists
+// and is not a directory is returned as an absolute path relative to root.
+//
+// If nothing matches, a warning is logged and the cleaned exec is returned as
+// is. The returned error is currently always nil.
+func GetExecutablePath(exec, root string, env []string) (string, error) {
+	exec = filepath.Clean(exec)
+
+	// Don't search PATH if exec is a path to a file (absolute or relative).
+	if strings.Contains(exec, "/") {
+		return exec, nil
+	}
+
+	// Get the PATH from the environment. Only the first PATH entry is used.
+	const prefix = "PATH="
+	var path []string
+	for _, e := range env {
+		if strings.HasPrefix(e, prefix) {
+			path = strings.Split(strings.TrimPrefix(e, prefix), ":")
+			break
+		}
+	}
+
+	// Search the PATH for a file whose name matches the one we are looking
+	// for.
+	for _, p := range path {
+		abs := filepath.Join(root, p, exec)
+		fi, err := os.Stat(abs)
+		if err != nil {
+			continue
+		}
+		// A directory that happens to share the executable's name (e.g.
+		// "bin" under "/usr") cannot be executed; keep searching.
+		if fi.IsDir() {
+			continue
+		}
+		// We found it! Return the path relative to the root.
+		return filepath.Join("/", p, exec), nil
+	}
+
+	// Could not find a suitable path, just return the original string.
+	log.Warningf("could not find executable %s in path %s", exec, path)
+	return exec, nil
+}
+
+// Capabilities takes in spec and returns a TaskCapabilities corresponding to
+// the spec.
+func Capabilities(specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) {
+ var caps auth.TaskCapabilities
+ if specCaps != nil {
+ var err error
+ if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding); err != nil {
+ return nil, err
+ }
+ if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective); err != nil {
+ return nil, err
+ }
+ if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable); err != nil {
+ return nil, err
+ }
+ if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted); err != nil {
+ return nil, err
+ }
+ // TODO: Support ambient capabilities.
+ }
+ return &caps, nil
+}
+
+// capFromName maps the capability names used in OCI runtime specs (the Linux
+// "CAP_*" identifiers) to the corresponding linux.Capability values. Names
+// that are absent from this table are treated as unknown by capsFromNames.
+var capFromName = map[string]linux.Capability{
+	"CAP_CHOWN":            linux.CAP_CHOWN,
+	"CAP_DAC_OVERRIDE":     linux.CAP_DAC_OVERRIDE,
+	"CAP_DAC_READ_SEARCH":  linux.CAP_DAC_READ_SEARCH,
+	"CAP_FOWNER":           linux.CAP_FOWNER,
+	"CAP_FSETID":           linux.CAP_FSETID,
+	"CAP_KILL":             linux.CAP_KILL,
+	"CAP_SETGID":           linux.CAP_SETGID,
+	"CAP_SETUID":           linux.CAP_SETUID,
+	"CAP_SETPCAP":          linux.CAP_SETPCAP,
+	"CAP_LINUX_IMMUTABLE":  linux.CAP_LINUX_IMMUTABLE,
+	"CAP_NET_BIND_SERVICE": linux.CAP_NET_BIND_SERVICE,
+	// The Linux name is CAP_NET_BROADCAST; only the Go constant in the
+	// linux package is spelled with the extra underscore.
+	"CAP_NET_BROADCAST": linux.CAP_NET_BROAD_CAST,
+	// Misspelled alias kept so that specs relying on the previous key keep
+	// working.
+	"CAP_NET_BROAD_CAST": linux.CAP_NET_BROAD_CAST,
+	"CAP_NET_ADMIN":      linux.CAP_NET_ADMIN,
+	"CAP_NET_RAW":        linux.CAP_NET_RAW,
+	"CAP_IPC_LOCK":       linux.CAP_IPC_LOCK,
+	"CAP_IPC_OWNER":      linux.CAP_IPC_OWNER,
+	"CAP_SYS_MODULE":     linux.CAP_SYS_MODULE,
+	"CAP_SYS_RAWIO":      linux.CAP_SYS_RAWIO,
+	"CAP_SYS_CHROOT":     linux.CAP_SYS_CHROOT,
+	"CAP_SYS_PTRACE":     linux.CAP_SYS_PTRACE,
+	"CAP_SYS_PACCT":      linux.CAP_SYS_PACCT,
+	"CAP_SYS_ADMIN":      linux.CAP_SYS_ADMIN,
+	"CAP_SYS_BOOT":       linux.CAP_SYS_BOOT,
+	"CAP_SYS_NICE":       linux.CAP_SYS_NICE,
+	"CAP_SYS_RESOURCE":   linux.CAP_SYS_RESOURCE,
+	"CAP_SYS_TIME":       linux.CAP_SYS_TIME,
+	"CAP_SYS_TTY_CONFIG": linux.CAP_SYS_TTY_CONFIG,
+	"CAP_MKNOD":          linux.CAP_MKNOD,
+	"CAP_LEASE":          linux.CAP_LEASE,
+	"CAP_AUDIT_WRITE":    linux.CAP_AUDIT_WRITE,
+	"CAP_AUDIT_CONTROL":  linux.CAP_AUDIT_CONTROL,
+	"CAP_SETFCAP":        linux.CAP_SETFCAP,
+	"CAP_MAC_OVERRIDE":   linux.CAP_MAC_OVERRIDE,
+	"CAP_MAC_ADMIN":      linux.CAP_MAC_ADMIN,
+	"CAP_SYSLOG":         linux.CAP_SYSLOG,
+	"CAP_WAKE_ALARM":     linux.CAP_WAKE_ALARM,
+	"CAP_BLOCK_SUSPEND":  linux.CAP_BLOCK_SUSPEND,
+}
+
+// capsFromNames translates a list of capability names into a single
+// auth.CapabilitySet by looking each name up in capFromName. The first name
+// missing from that table aborts the conversion with an error.
+func capsFromNames(names []string) (auth.CapabilitySet, error) {
+	var resolved []linux.Capability
+	for i := range names {
+		if c, known := capFromName[names[i]]; known {
+			resolved = append(resolved, c)
+			continue
+		}
+		return 0, fmt.Errorf("unknown capability %q", names[i])
+	}
+	return auth.CapabilitySetOfMany(resolved), nil
+}
+
+// Is9PMount returns true if the given mount can be mounted as an external gofer.
+//
+// That is the case for "bind" mounts that have a non-empty source and whose
+// destination is neither "/dev" itself nor located below it.
+func Is9PMount(m specs.Mount) bool {
+	if m.Type != "bind" || m.Source == "" {
+		return false
+	}
+	// Match "/dev" on a path-component boundary so that unrelated siblings
+	// such as "/devel" or "/device-data" are not mistaken for device mounts.
+	underDev := m.Destination == "/dev" || strings.HasPrefix(m.Destination, "/dev/")
+	return !underDev
+}
+
+// BinPath returns the real path of the running binary, with the symbolic link
+// "/proc/self/exe" resolved. This is done to make the process name appear as
+// 'runsc', instead of 'exe'.
+func BinPath() (string, error) {
+	resolved, err := filepath.EvalSymlinks("/proc/self/exe")
+	if err == nil {
+		return resolved, nil
+	}
+	return "", fmt.Errorf(`error resolving "/proc/self/exe" symlink: %v`, err)
+}