Check in gVisor.

PiperOrigin-RevId: 194583126 Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
author: Googler <noreply@google.com> 2018-04-27 10:37:02 -0700
committer: Adin Scannell <ascannell@google.com> 2018-04-28 01:44:26 -0400
commit: d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree: 54f95eef73aee6bacbfc736fffc631be2605ed53 /runsc
parent: f70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)
50 files changed, 8348 insertions, 0 deletions
diff --git a/runsc/BUILD b/runsc/BUILD
new file mode 100644
index 000000000..3651c2d30
--- /dev/null
+++ b/runsc/BUILD
@@ -0,0 +1,17 @@
+package(licenses = ["notice"])  # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+
+go_binary(
+    name = "runsc",
+    srcs = [
+        "main.go",
+    ],
+    pure = "on",
+    deps = [
+        "//pkg/log",
+        "//runsc/boot",
+        "//runsc/cmd",
+        "@com_github_google_subcommands//:go_default_library",
+    ],
+)
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
new file mode 100644
index 000000000..88736cfa4
--- /dev/null
+++ b/runsc/boot/BUILD
@@ -0,0 +1,88 @@
+package(licenses = ["notice"])  # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+go_library(
+    name = "boot",
+    srcs = [
+        "capability.go",
+        "config.go",
+        "controller.go",
+        "events.go",
+        "fds.go",
+        "fs.go",
+        "limits.go",
+        "loader.go",
+        "network.go",
+        "strace.go",
+    ],
+    importpath = "gvisor.googlesource.com/gvisor/runsc/boot",
+    visibility = [
+        "//runsc:__subpackages__",
+    ],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/control/server",
+        "//pkg/cpuid",
+        "//pkg/log",
+        "//pkg/sentry/context",
+        "//pkg/sentry/control",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/fs/dev",
+        "//pkg/sentry/fs/gofer",
+        "//pkg/sentry/fs/host",
+        "//pkg/sentry/fs/proc",
+        "//pkg/sentry/fs/ramfs",
+        "//pkg/sentry/fs/sys",
+        "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/inet",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/kdefs",
+        "//pkg/sentry/limits",
+        "//pkg/sentry/loader",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/platform/kvm",
+        "//pkg/sentry/platform/ptrace",
+        "//pkg/sentry/sighandling",
+        "//pkg/sentry/socket/epsocket",
+        "//pkg/sentry/socket/hostinet",
+        "//pkg/sentry/socket/netlink",
+        "//pkg/sentry/socket/netlink/route",
+        "//pkg/sentry/socket/unix",
+        "//pkg/sentry/strace",
+        "//pkg/sentry/syscalls/linux",
+        "//pkg/sentry/time",
+        "//pkg/sentry/usage",
+        "//pkg/sentry/watchdog",
+        "//pkg/syserror",
+        "//pkg/tcpip",
+        "//pkg/tcpip/link/fdbased",
+        "//pkg/tcpip/link/loopback",
+        "//pkg/tcpip/link/sniffer",
+        "//pkg/tcpip/network/arp",
+        "//pkg/tcpip/network/ipv4",
+        "//pkg/tcpip/network/ipv6",
+        "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/tcp",
+        "//pkg/tcpip/transport/udp",
+        "//pkg/urpc",
+        "//runsc/boot/filter",
+        "//runsc/specutils",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@com_github_syndtr_gocapability//capability:go_default_library",
+    ],
+)
+
+go_test(
+    name = "boot_test",
+    size = "small",
+    srcs = ["loader_test.go"],
+    embed = [":boot"],
+    deps = [
+        "//pkg/control/server",
+        "//pkg/log",
+        "//pkg/sentry/context/contexttest",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+    ],
+)
diff --git a/runsc/boot/capability.go b/runsc/boot/capability.go
new file mode 100644
index 000000000..4c6a59245
--- /dev/null
+++ b/runsc/boot/capability.go
@@ -0,0 +1,120 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"os"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/syndtr/gocapability/capability"
+)
+
+// ApplyCaps applies the capabilities in the spec to the current thread.
+//
+// Note that it must be called with current thread locked.
+func ApplyCaps(conf *Config, caps *specs.LinuxCapabilities) error {
+	setter, err := capability.NewPid2(os.Getpid())
+	if err != nil {
+		return err
+	}
+
+	bounding, err := capsFromNames(caps.Bounding)
+	if err != nil {
+		return err
+	}
+	effective, err := capsFromNames(caps.Effective)
+	if err != nil {
+		return err
+	}
+	permitted, err := capsFromNames(caps.Permitted)
+	if err != nil {
+		return err
+	}
+	inheritable, err := capsFromNames(caps.Inheritable)
+	if err != nil {
+		return err
+	}
+	ambient, err := capsFromNames(caps.Ambient)
+	if err != nil {
+		return err
+	}
+
+	// Ptrace platform requires extra capabilities.
+	if conf.Platform == PlatformPtrace {
+		bounding = append(bounding, capability.CAP_SYS_PTRACE)
+		effective = append(effective, capability.CAP_SYS_PTRACE)
+		permitted = append(permitted, capability.CAP_SYS_PTRACE)
+	}
+
+	setter.Set(capability.BOUNDS, bounding...)
+	setter.Set(capability.PERMITTED, permitted...)
+	setter.Set(capability.INHERITABLE, inheritable...)
+	setter.Set(capability.EFFECTIVE, effective...)
+	setter.Set(capability.AMBIENT, ambient...)
+	return setter.Apply(capability.CAPS | capability.BOUNDS | capability.AMBS)
+}
+
+func capsFromNames(names []string) ([]capability.Cap, error) {
+	var caps []capability.Cap
+	for _, name := range names {
+		cap, ok := capFromName[name]
+		if !ok {
+			return nil, fmt.Errorf("invalid capability %q", name)
+		}
+		caps = append(caps, cap)
+	}
+	return caps, nil
+}
+
+var capFromName = map[string]capability.Cap{
+	"CAP_CHOWN":            capability.CAP_CHOWN,
+	"CAP_DAC_OVERRIDE":     capability.CAP_DAC_OVERRIDE,
+	"CAP_DAC_READ_SEARCH":  capability.CAP_DAC_READ_SEARCH,
+	"CAP_FOWNER":           capability.CAP_FOWNER,
+	"CAP_FSETID":           capability.CAP_FSETID,
+	"CAP_KILL":             capability.CAP_KILL,
+	"CAP_SETGID":           capability.CAP_SETGID,
+	"CAP_SETUID":           capability.CAP_SETUID,
+	"CAP_SETPCAP":          capability.CAP_SETPCAP,
+	"CAP_LINUX_IMMUTABLE":  capability.CAP_LINUX_IMMUTABLE,
+	"CAP_NET_BIND_SERVICE": capability.CAP_NET_BIND_SERVICE,
+	"CAP_NET_BROADCAST":    capability.CAP_NET_BROADCAST,
+	"CAP_NET_ADMIN":        capability.CAP_NET_ADMIN,
+	"CAP_NET_RAW":          capability.CAP_NET_RAW,
+	"CAP_IPC_LOCK":         capability.CAP_IPC_LOCK,
+	"CAP_IPC_OWNER":        capability.CAP_IPC_OWNER,
+	"CAP_SYS_MODULE":       capability.CAP_SYS_MODULE,
+	"CAP_SYS_RAWIO":        capability.CAP_SYS_RAWIO,
+	"CAP_SYS_CHROOT":       capability.CAP_SYS_CHROOT,
+	"CAP_SYS_PTRACE":       capability.CAP_SYS_PTRACE,
+	"CAP_SYS_PACCT":        capability.CAP_SYS_PACCT,
+	"CAP_SYS_ADMIN":        capability.CAP_SYS_ADMIN,
+	"CAP_SYS_BOOT":         capability.CAP_SYS_BOOT,
+	"CAP_SYS_NICE":         capability.CAP_SYS_NICE,
+	"CAP_SYS_RESOURCE":     capability.CAP_SYS_RESOURCE,
+	"CAP_SYS_TIME":         capability.CAP_SYS_TIME,
+	"CAP_SYS_TTY_CONFIG":   capability.CAP_SYS_TTY_CONFIG,
+	"CAP_MKNOD":            capability.CAP_MKNOD,
+	"CAP_LEASE":            capability.CAP_LEASE,
+	"CAP_AUDIT_WRITE":      capability.CAP_AUDIT_WRITE,
+	"CAP_AUDIT_CONTROL":    capability.CAP_AUDIT_CONTROL,
+	"CAP_SETFCAP":          capability.CAP_SETFCAP,
+	"CAP_MAC_OVERRIDE":     capability.CAP_MAC_OVERRIDE,
+	"CAP_MAC_ADMIN":        capability.CAP_MAC_ADMIN,
+	"CAP_SYSLOG":           capability.CAP_SYSLOG,
+	"CAP_WAKE_ALARM":       capability.CAP_WAKE_ALARM,
+	"CAP_BLOCK_SUSPEND":    capability.CAP_BLOCK_SUSPEND,
+}
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
new file mode 100644
index 000000000..f3e33e89a
--- /dev/null
+++ b/runsc/boot/config.go
@@ -0,0 +1,162 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import "fmt"
+
+// PlatformType tells which platform to use.
+type PlatformType int
+
+const (
+	// PlatformPtrace runs the sandbox with the ptrace platform.
+	PlatformPtrace PlatformType = iota
+
+	// PlatformKVM runs the sandbox with the KVM platform.
+	PlatformKVM
+)
+
+// MakePlatformType converts type from string.
+func MakePlatformType(s string) (PlatformType, error) {
+	switch s {
+	case "ptrace":
+		return PlatformPtrace, nil
+	case "kvm":
+		return PlatformKVM, nil
+	default:
+		return 0, fmt.Errorf("invalid platform type %q", s)
+	}
+}
+
+func (p PlatformType) String() string {
+	switch p {
+	case PlatformPtrace:
+		return "ptrace"
+	case PlatformKVM:
+		return "kvm"
+	default:
+		return fmt.Sprintf("unknown(%d)", p)
+	}
+}
+
+// FileAccessType tells how the filesystem is accessed.
+type FileAccessType int
+
+const (
+	// FileAccessProxy sends IO requests to a Gofer process that validates the
+	// requests and forwards them to the host.
+	FileAccessProxy FileAccessType = iota
+
+	// FileAccessDirect connects the sandbox directly to the host filesystem.
+	FileAccessDirect
+)
+
+// MakeFileAccessType converts type from string.
+func MakeFileAccessType(s string) (FileAccessType, error) {
+	switch s {
+	case "proxy":
+		return FileAccessProxy, nil
+	case "direct":
+		return FileAccessDirect, nil
+	default:
+		return 0, fmt.Errorf("invalid file access type %q", s)
+	}
+}
+
+func (f FileAccessType) String() string {
+	switch f {
+	case FileAccessProxy:
+		return "proxy"
+	case FileAccessDirect:
+		return "direct"
+	default:
+		return fmt.Sprintf("unknown(%d)", f)
+	}
+}
+
+// NetworkType tells which network stack to use.
+type NetworkType int
+
+const (
+	// NetworkSandbox uses internal network stack, isolated from the host.
+	NetworkSandbox NetworkType = iota
+
+	// NetworkHost redirects network related syscalls to the host network.
+	NetworkHost
+
+	// NetworkNone sets up just loopback using netstack.
+	NetworkNone
+)
+
+// MakeNetworkType converts type from string.
+func MakeNetworkType(s string) (NetworkType, error) {
+	switch s {
+	case "sandbox":
+		return NetworkSandbox, nil
+	case "host":
+		return NetworkHost, nil
+	case "none":
+		return NetworkNone, nil
+	default:
+		return 0, fmt.Errorf("invalid network type %q", s)
+	}
+}
+
+func (n NetworkType) String() string {
+	switch n {
+	case NetworkSandbox:
+		return "sandbox"
+	case NetworkHost:
+		return "host"
+	case NetworkNone:
+		return "none"
+	default:
+		return fmt.Sprintf("unknown(%d)", n)
+	}
+}
+
+// Config holds configuration that is not part of the runtime spec.
+type Config struct {
+	// RootDir is the runtime root directory.
+	RootDir string
+
+	// FileAccess indicates how the filesystem is accessed.
+	FileAccess FileAccessType
+
+	// Overlay is whether to wrap the root filesystem in an overlay.
+	Overlay bool
+
+	// Network indicates what type of network to use.
+	Network NetworkType
+
+	// LogPackets indicates that all network packets should be logged.
+	LogPackets bool
+
+	// Platform is the platform to run on.
+	Platform PlatformType
+
+	// Strace indicates that strace should be enabled.
+	Strace bool
+
+	// StraceSyscalls is the set of syscalls to trace. If Strace is true
+	// and this list is empty, then all syscalls will be traced.
+	StraceSyscalls []string
+
+	// StraceLogSize is the max size of data blobs to display.
+	StraceLogSize uint
+
+	// DisableSeccomp indicates whether seccomp syscall filters should be
+	// disabled. Pardon the double negation, but default to enabled is important.
+	DisableSeccomp bool
+}
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
new file mode 100644
index 000000000..4d4ef7256
--- /dev/null
+++ b/runsc/boot/controller.go
@@ -0,0 +1,128 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+
+	"gvisor.googlesource.com/gvisor/pkg/control/server"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/control"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket"
+)
+
+const (
+	// ApplicationStart is the URPC endpoint for starting a sandboxed app.
+	ApplicationStart = "application.Start"
+
+	// ApplicationProcesses is the URPC endpoint for getting the list of
+	// processes running in a sandbox.
+	ApplicationProcesses = "application.Processes"
+
+	// ApplicationExecute is the URPC endpoint for executing a command in a
+	// sandbox.
+	ApplicationExecute = "application.Execute"
+
+	// ApplicationEvent is the URPC endpoint for getting stats about the
+	// container used by "runsc events".
+	ApplicationEvent = "application.Event"
+
+	// NetworkCreateLinksAndRoutes is the URPC endpoint for creating links
+	// and routes in a network stack.
+	NetworkCreateLinksAndRoutes = "Network.CreateLinksAndRoutes"
+)
+
+// ControlSocketAddr generates an abstract unix socket name for the given id.
+func ControlSocketAddr(id string) string {
+	return fmt.Sprintf("\x00runsc-sandbox.%s", id)
+}
+
+// controller holds the control server, and is used for communication into the
+// sandbox.
+type controller struct {
+	// srv is the control server.
+	srv *server.Server
+
+	// app holds the application methods.
+	app *application
+}
+
+// newController creates a new controller and starts it listening.
+func newController(fd int, k *kernel.Kernel) (*controller, error) {
+	srv, err := server.CreateFromFD(fd)
+	if err != nil {
+		return nil, err
+	}
+
+	app := &application{
+		startChan:       make(chan struct{}),
+		startResultChan: make(chan error, 1),
+		k:               k,
+	}
+	srv.Register(app)
+
+	if eps, ok := k.NetworkStack().(*epsocket.Stack); ok {
+		net := &Network{
+			Stack: eps.Stack,
+		}
+		srv.Register(net)
+	}
+
+	if err := srv.StartServing(); err != nil {
+		return nil, err
+	}
+
+	return &controller{
+		srv: srv,
+		app: app,
+	}, nil
+}
+
+// application contains methods that control the sandboxed application.
+type application struct {
+	// startChan is used to signal when the application process should be
+	// started.
+	startChan chan struct{}
+
+	// startResultChan is used to signal when the application has started. Any
+	// errors encountered during startup will be sent to the channel. A nil value
+	// indicates success.
+	startResultChan chan error
+
+	// k is the emulated linux kernel on which the sandboxed
+	// application runs.
+	k *kernel.Kernel
+}
+
+// Start will start the application process.
+func (a *application) Start(_, _ *struct{}) error {
+	// Tell the application to start and wait for the result.
+	a.startChan <- struct{}{}
+	return <-a.startResultChan
+}
+
+// Processes retrieves information about processes running in the sandbox.
+func (a *application) Processes(_, out *[]*control.Process) error {
+	return control.Processes(a.k, out)
+}
+
+// Execute runs a command on a created or running sandbox.
+func (a *application) Execute(e *control.ExecArgs, waitStatus *uint32) error {
+	proc := control.Proc{Kernel: a.k}
+	if err := proc.Exec(e, waitStatus); err != nil {
+		return fmt.Errorf("error executing: %+v: %v", e, err)
+	}
+	return nil
+}
diff --git a/runsc/boot/events.go b/runsc/boot/events.go
new file mode 100644
index 000000000..ef6459b01
--- /dev/null
+++ b/runsc/boot/events.go
@@ -0,0 +1,81 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+)
+
+// Event struct for encoding the event data to JSON. Corresponds to runc's
+// main.event struct.
+type Event struct {
+	Type string      `json:"type"`
+	ID   string      `json:"id"`
+	Data interface{} `json:"data,omitempty"`
+}
+
+// Stats is the runc specific stats structure for stability when encoding and
+// decoding stats.
+// TODO: Many fields aren't obtainable due to a lack of cgroups.
+type Stats struct {
+	Memory Memory `json:"memory"`
+	Pids   Pids   `json:"pids"`
+}
+
+// Pids contains stats on processes.
+type Pids struct {
+	Current uint64 `json:"current,omitempty"`
+	Limit   uint64 `json:"limit,omitempty"`
+}
+
+// MemoryEntry contains stats on a kind of memory.
+type MemoryEntry struct {
+	Limit   uint64 `json:"limit"`
+	Usage   uint64 `json:"usage,omitempty"`
+	Max     uint64 `json:"max,omitempty"`
+	Failcnt uint64 `json:"failcnt"`
+}
+
+// Memory contains stats on memory.
+type Memory struct {
+	Cache     uint64            `json:"cache,omitempty"`
+	Usage     MemoryEntry       `json:"usage,omitempty"`
+	Swap      MemoryEntry       `json:"swap,omitempty"`
+	Kernel    MemoryEntry       `json:"kernel,omitempty"`
+	KernelTCP MemoryEntry       `json:"kernelTCP,omitempty"`
+	Raw       map[string]uint64 `json:"raw,omitempty"`
+}
+
+func (a *application) Event(_ *struct{}, out *Event) error {
+	stats := &Stats{}
+	stats.populateMemory(a.k)
+	stats.populatePIDs(a.k)
+	*out = Event{Type: "stats", Data: stats}
+	return nil
+}
+
+func (s *Stats) populateMemory(k *kernel.Kernel) {
+	mem := k.Platform.Memory()
+	mem.UpdateUsage()
+	_, totalUsage := usage.MemoryAccounting.Copy()
+	s.Memory.Usage = MemoryEntry{
+		Usage: totalUsage,
+	}
+}
+
+func (s *Stats) populatePIDs(k *kernel.Kernel) {
+	s.Pids.Current = uint64(len(k.TaskSet().Root.ThreadGroups()))
+}
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go
new file mode 100644
index 000000000..0449e243d
--- /dev/null
+++ b/runsc/boot/fds.go
@@ -0,0 +1,61 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/host"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+)
+
+// createFDMap creates an fd map that contains stdin, stdout, and stderr. If
+// console is true, then ioctl calls will be passed through to the host fd.
+//
+// TODO: We currently aren't passing any FDs in to the sandbox, so
+// there's not much else for this function to do.  It will get more complicated
+// when gofers enter the picture.  Also the LISTEN_FDS environment variable
+// allows passing arbitrary FDs to the sandbox, which we do not yet support.
+func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool) (*kernel.FDMap, error) {
+	fdm := k.NewFDMap()
+	defer fdm.DecRef()
+
+	// Maps sandbox fd to host fd.
+	fdMap := map[int]int{
+		0: syscall.Stdin,
+		1: syscall.Stdout,
+		2: syscall.Stderr,
+	}
+	mounter := fs.FileOwnerFromContext(ctx)
+
+	for sfd, hfd := range fdMap {
+		file, err := host.ImportFile(ctx, hfd, mounter, console /* allow ioctls */)
+		if err != nil {
+			return nil, fmt.Errorf("failed to import fd %d: %v", hfd, err)
+		}
+		defer file.DecRef()
+		if err := fdm.NewFDAt(kdefs.FD(sfd), file, kernel.FDFlags{}, l); err != nil {
+			return nil, fmt.Errorf("failed to add imported fd %d to FDMap: %v", hfd, err)
+		}
+	}
+
+	fdm.IncRef()
+	return fdm, nil
+}
diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD
new file mode 100644
index 000000000..fd1b18717
--- /dev/null
+++ b/runsc/boot/filter/BUILD
@@ -0,0 +1,26 @@
+package(licenses = ["notice"])  # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+    name = "filter",
+    srcs = [
+        "config.go",
+        "extra_filters.go",
+        "extra_filters_msan.go",
+        "extra_filters_race.go",
+        "filter.go",
+    ],
+    importpath = "gvisor.googlesource.com/gvisor/runsc/boot/filter",
+    visibility = [
+        "//runsc/boot:__subpackages__",
+    ],
+    deps = [
+        "//pkg/log",
+        "//pkg/seccomp",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/platform/kvm",
+        "//pkg/sentry/platform/ptrace",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
new file mode 100644
index 000000000..130e987df
--- /dev/null
+++ b/runsc/boot/filter/config.go
@@ -0,0 +1,175 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package filter
+
+import (
+	"syscall"
+
+	"golang.org/x/sys/unix"
+)
+
+// allowedSyscalls is the set of syscalls executed by the Sentry
+// to the host OS.
+var allowedSyscalls = []uintptr{
+	syscall.SYS_ACCEPT,
+	syscall.SYS_ARCH_PRCTL,
+	syscall.SYS_CLOCK_GETTIME,
+	syscall.SYS_CLONE,
+	syscall.SYS_CLOSE,
+	syscall.SYS_DUP,
+	syscall.SYS_DUP2,
+	syscall.SYS_EPOLL_CREATE1,
+	syscall.SYS_EPOLL_CTL,
+	syscall.SYS_EPOLL_PWAIT,
+	syscall.SYS_EPOLL_WAIT,
+	syscall.SYS_EVENTFD2,
+	syscall.SYS_EXIT,
+	syscall.SYS_EXIT_GROUP,
+	syscall.SYS_FALLOCATE,
+	syscall.SYS_FCHMOD,
+	syscall.SYS_FCNTL,
+	syscall.SYS_FSTAT,
+	syscall.SYS_FSYNC,
+	syscall.SYS_FTRUNCATE,
+	syscall.SYS_FUTEX,
+	syscall.SYS_GETDENTS64,
+	syscall.SYS_GETPID,
+	unix.SYS_GETRANDOM,
+	syscall.SYS_GETSOCKOPT,
+	syscall.SYS_GETTID,
+	syscall.SYS_GETTIMEOFDAY,
+	syscall.SYS_LISTEN,
+	syscall.SYS_LSEEK,
+	syscall.SYS_MADVISE,
+	syscall.SYS_MINCORE,
+	syscall.SYS_MMAP,
+	syscall.SYS_MPROTECT,
+	syscall.SYS_MUNMAP,
+	syscall.SYS_NEWFSTATAT,
+	syscall.SYS_POLL,
+	syscall.SYS_PREAD64,
+	syscall.SYS_PSELECT6,
+	syscall.SYS_PWRITE64,
+	syscall.SYS_READ,
+	syscall.SYS_READLINKAT,
+	syscall.SYS_READV,
+	syscall.SYS_RECVMSG,
+	syscall.SYS_RENAMEAT,
+	syscall.SYS_RESTART_SYSCALL,
+	syscall.SYS_RT_SIGACTION,
+	syscall.SYS_RT_SIGPROCMASK,
+	syscall.SYS_RT_SIGRETURN,
+	syscall.SYS_SCHED_YIELD,
+	syscall.SYS_SENDMSG,
+	syscall.SYS_SETITIMER,
+	syscall.SYS_SHUTDOWN,
+	syscall.SYS_SIGALTSTACK,
+	syscall.SYS_SYNC_FILE_RANGE,
+	syscall.SYS_TGKILL,
+	syscall.SYS_UTIMENSAT,
+	syscall.SYS_WRITE,
+	syscall.SYS_WRITEV,
+}
+
+// TODO: Ioctl is needed in order to support tty consoles.
+// Once filters support argument-checking, we should only allow ioctl
+// with tty-related arguments.
+func consoleFilters() []uintptr {
+	return []uintptr{
+		syscall.SYS_IOCTL,
+	}
+}
+
+// whitelistFSFilters returns syscalls made by whitelistFS. Using WhitelistFS
+// is less secure because it runs inside the Sentry and must be able to perform
+// file operations that would otherwise be disabled by seccomp when a Gofer is
+// used. When whitelistFS is not used, opening new FDs in the Sentry is
+// disallowed.
+func whitelistFSFilters() []uintptr {
+	return []uintptr{
+		syscall.SYS_ACCESS,
+		syscall.SYS_FCHMOD,
+		syscall.SYS_FSTAT,
+		syscall.SYS_FSYNC,
+		syscall.SYS_FTRUNCATE,
+		syscall.SYS_GETCWD,
+		syscall.SYS_GETDENTS,
+		syscall.SYS_GETDENTS64,
+		syscall.SYS_LSEEK,
+		syscall.SYS_LSTAT,
+		syscall.SYS_MKDIR,
+		syscall.SYS_MKDIRAT,
+		syscall.SYS_NEWFSTATAT,
+		syscall.SYS_OPEN,
+		syscall.SYS_OPENAT,
+		syscall.SYS_PREAD64,
+		syscall.SYS_PWRITE64,
+		syscall.SYS_READ,
+		syscall.SYS_READLINK,
+		syscall.SYS_READLINKAT,
+		syscall.SYS_RENAMEAT,
+		syscall.SYS_STAT,
+		syscall.SYS_SYMLINK,
+		syscall.SYS_SYMLINKAT,
+		syscall.SYS_SYNC_FILE_RANGE,
+		syscall.SYS_UNLINK,
+		syscall.SYS_UNLINKAT,
+		syscall.SYS_UTIMENSAT,
+		syscall.SYS_WRITE,
+	}
+}
+
+// hostInetFilters contains syscalls that are needed by sentry/socket/hostinet.
+func hostInetFilters() []uintptr {
+	return []uintptr{
+		syscall.SYS_ACCEPT4,
+		syscall.SYS_BIND,
+		syscall.SYS_CONNECT,
+		syscall.SYS_GETPEERNAME,
+		syscall.SYS_GETSOCKNAME,
+		syscall.SYS_GETSOCKOPT,
+		syscall.SYS_IOCTL,
+		syscall.SYS_LISTEN,
+		syscall.SYS_READV,
+		syscall.SYS_RECVFROM,
+		syscall.SYS_RECVMSG,
+		syscall.SYS_SENDMSG,
+		syscall.SYS_SENDTO,
+		syscall.SYS_SETSOCKOPT,
+		syscall.SYS_SHUTDOWN,
+		syscall.SYS_SOCKET,
+		syscall.SYS_WRITEV,
+	}
+}
+
+// ptraceFilters returns syscalls made exclusively by the ptrace platform.
+func ptraceFilters() []uintptr {
+	return []uintptr{
+		syscall.SYS_PTRACE,
+		syscall.SYS_WAIT4,
+		unix.SYS_GETCPU,
+		unix.SYS_SCHED_SETAFFINITY,
+	}
+}
+
+// kvmFilters returns syscalls made exclusively by the KVM platform.
+func kvmFilters() []uintptr {
+	return []uintptr{
+		syscall.SYS_IOCTL,
+		syscall.SYS_RT_SIGSUSPEND,
+		syscall.SYS_RT_SIGTIMEDWAIT,
+		0xffffffffffffffff, // KVM uses syscall -1 to transition to host.
+	}
+}
diff --git a/runsc/boot/filter/extra_filters.go b/runsc/boot/filter/extra_filters.go
new file mode 100644
index 000000000..e10d9bf4c
--- /dev/null
+++ b/runsc/boot/filter/extra_filters.go
@@ -0,0 +1,24 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !msan,!race
+
+package filter
+
+// instrumentationFilters returns additional filters for syscalls used by
+// Go instrumentation tools, e.g. -race, -msan.
+// Returns empty when disabled.
+func instrumentationFilters() []uintptr {
+	return nil
+}
diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go
new file mode 100644
index 000000000..a862340f6
--- /dev/null
+++ b/runsc/boot/filter/extra_filters_msan.go
@@ -0,0 +1,30 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build msan
+
+package filter
+
+import (
+	"syscall"
+)
+
+// instrumentationFilters returns additional filters for syscalls used by MSAN.
+func instrumentationFilters() []uintptr {
+	Report("MSAN is enabled: syscall filters less restrictive!")
+	return []uintptr{
+		syscall.SYS_SCHED_GETAFFINITY,
+		syscall.SYS_SET_ROBUST_LIST,
+	}
+}
diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go
new file mode 100644
index 000000000..b0c74a58a
--- /dev/null
+++ b/runsc/boot/filter/extra_filters_race.go
@@ -0,0 +1,33 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build race
+
+package filter
+
+import (
+	"syscall"
+)
+
+// instrumentationFilters returns additional filters for syscalls used by TSAN.
+func instrumentationFilters() []uintptr {
+	Report("TSAN is enabled: syscall filters less restrictive!")
+	return []uintptr{
+		syscall.SYS_BRK,
+		syscall.SYS_MUNLOCK,
+		syscall.SYS_NANOSLEEP,
+		syscall.SYS_OPEN,
+		syscall.SYS_SET_ROBUST_LIST,
+	}
+}
diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go
new file mode 100644
index 000000000..3ba56a318
--- /dev/null
+++ b/runsc/boot/filter/filter.go
@@ -0,0 +1,67 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package filter defines all syscalls the sandbox is allowed to make
+// to the host, and installs seccomp filters to prevent prohibited
+// syscalls in case it's compromised.
+package filter
+
+import (
+	"fmt"
+
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/seccomp"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace"
+)
+
+// Install builds the seccomp syscall whitelist for the sandbox and installs
+// it.
+//
+// The whitelist starts from allowedSyscalls and is widened with the filters
+// needed by sanitizer instrumentation, by each feature enabled through
+// whitelistFS, console and hostNetwork, and by the platform in use. Each
+// feature-specific widening is logged through Report. An error is returned
+// if p is not one of the platform types handled here.
+func Install(p platform.Platform, whitelistFS, console, hostNetwork bool) error {
+	// Filters used by -race and -msan builds come first. The list is empty
+	// when no instrumentation is compiled in.
+	filters := append(allowedSyscalls, instrumentationFilters()...)
+
+	if whitelistFS {
+		Report("direct file access allows unrestricted file access!")
+		filters = append(filters, whitelistFSFilters()...)
+	}
+	if console {
+		Report("console is enabled: syscall filters less restrictive!")
+		filters = append(filters, consoleFilters()...)
+	}
+	if hostNetwork {
+		Report("host networking enabled: syscall filters less restrictive!")
+		filters = append(filters, hostInetFilters()...)
+	}
+
+	// Platform-specific syscalls go last.
+	switch p.(type) {
+	case *ptrace.PTrace:
+		filters = append(filters, ptraceFilters()...)
+	case *kvm.KVM:
+		filters = append(filters, kvmFilters()...)
+	default:
+		return fmt.Errorf("unknown platform type %T", p)
+	}
+
+	// TODO: Set kill=true when SECCOMP_RET_KILL_PROCESS is supported.
+	return seccomp.Install(filters, false)
+}
+
+// Report logs msg at warning level behind a "*** SECCOMP WARNING:" prefix.
+// Callers in this package use it whenever the syscall filters are loosened.
+func Report(msg string) {
+	log.Warningf("*** SECCOMP WARNING: %s", msg)
+}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
new file mode 100644
index 000000000..2073bd0b1
--- /dev/null
+++ b/runsc/boot/fs.go
@@ -0,0 +1,441 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+
+	// Include filesystem types that OCI spec might mount.
+	_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/dev"
+	_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer"
+	_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host"
+	_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc"
+	_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys"
+	_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// fdDispenser hands out file descriptors one at a time, in the order in
+// which they were provided.
+type fdDispenser struct {
+	fds []int
+}
+
+// remove returns the next file descriptor and drops it from the dispenser.
+// It must not be called on an empty dispenser.
+func (f *fdDispenser) remove() int {
+	head, rest := f.fds[0], f.fds[1:]
+	f.fds = rest
+	return head
+}
+
+// empty reports whether every file descriptor has been handed out.
+func (f *fdDispenser) empty() bool {
+	return len(f.fds) < 1
+}
+
+// createMountNamespace creates a mount namespace containing the root
+// filesystem and all mounts from the spec. /dev is always mounted, and /proc,
+// /sys and /tmp are mounted even when the spec does not ask for them.
+//
+// ioFDs are handed out in order to the root mount and to the submounts that
+// need one. It is an error for any of them to be left over at the end.
+func createMountNamespace(ctx context.Context, spec *specs.Spec, conf *Config, ioFDs []int) (*fs.MountNamespace, error) {
+	fds := &fdDispenser{fds: ioFDs}
+
+	// Create the MountNamespace from the root.
+	rootInode, err := createRootMount(ctx, spec, conf, fds)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create root overlay: %v", err)
+	}
+	mns, err := fs.NewMountNamespace(ctx, rootInode)
+	if err != nil {
+		return nil, fmt.Errorf("failed to construct MountNamespace: %v", err)
+	}
+
+	// Keep track of whether proc, sys, and tmp were mounted.
+	var procMounted, sysMounted, tmpMounted bool
+
+	// Mount all submounts from the spec.
+	for _, m := range spec.Mounts {
+		// Match against the cleaned destination so that spellings such as
+		// "/proc/" are recognized as well.
+		dst := filepath.Clean(m.Destination)
+
+		// OCI spec uses many different mounts for the things inside of '/dev'. We
+		// have a single mount at '/dev' that is always mounted, regardless of
+		// whether it was asked for, as the spec says we SHOULD.
+		//
+		// Only "/dev" itself and paths below it are skipped. A destination
+		// that merely starts with the same characters (e.g. "/devices") is
+		// unrelated and must still be mounted.
+		if dst == "/dev" || strings.HasPrefix(dst, "/dev/") {
+			log.Warningf("ignoring dev mount at %q", m.Destination)
+			continue
+		}
+		switch dst {
+		case "/proc":
+			procMounted = true
+		case "/sys":
+			sysMounted = true
+		case "/tmp":
+			tmpMounted = true
+		}
+
+		if err := mountSubmount(ctx, spec, conf, mns, fds, m); err != nil {
+			return nil, err
+		}
+	}
+
+	// Always mount /dev.
+	if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{
+		Type:        "devtmpfs",
+		Destination: "/dev",
+	}); err != nil {
+		return nil, err
+	}
+
+	// Mount proc and sys even if the user did not ask for it, as the spec
+	// says we SHOULD.
+	if !procMounted {
+		if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{
+			Type:        "proc",
+			Destination: "/proc",
+		}); err != nil {
+			return nil, err
+		}
+	}
+	if !sysMounted {
+		if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{
+			Type:        "sysfs",
+			Destination: "/sys",
+		}); err != nil {
+			return nil, err
+		}
+	}
+
+	// Technically we don't have to mount tmpfs at /tmp, as we could just
+	// rely on the host /tmp, but this is a nice optimization, and fixes
+	// some apps that call mknod in /tmp.
+	if !tmpMounted {
+		if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{
+			Type:        "tmpfs",
+			Destination: "/tmp",
+		}); err != nil {
+			return nil, err
+		}
+	}
+
+	// Every FD passed in is expected to back exactly one mount. Report the
+	// leftover FD numbers themselves rather than the dispenser pointer.
+	if !fds.empty() {
+		return nil, fmt.Errorf("not all mount points were consumed, remaining: %v", fds.fds)
+	}
+
+	return mns, nil
+}
+
+// createRootMount creates the root filesystem inode.
+//
+// With FileAccessProxy the root is mounted over 9P using the next FD from
+// fds; with FileAccessDirect it is mounted from spec.Root.Path through
+// whitelistfs. The result is overlaid on a ramfs tree holding stub
+// directories for all submounts and, when conf.Overlay is set, covered by a
+// writable tmpfs layer. An error is returned for an unknown file access type
+// or when proxy mode has no FD left to use.
+func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.Inode, error) {
+	// First construct the filesystem from the spec.Root.
+	mf := fs.MountSourceFlags{
+		ReadOnly: spec.Root.Readonly,
+		NoAtime:  true,
+	}
+
+	var (
+		rootInode *fs.Inode
+		err       error
+	)
+	switch conf.FileAccess {
+	case FileAccessProxy:
+		// The root is served over 9P and needs one of the I/O FDs. Fail
+		// cleanly instead of panicking when none was provided.
+		if fds == nil || fds.empty() {
+			return nil, fmt.Errorf("no I/O FD available for the root mount")
+		}
+		fd := fds.remove()
+		log.Infof("Mounting root over 9P, ioFD: %d", fd)
+		hostFS := mustFindFilesystem("9p")
+		rootInode, err = hostFS.Mount(ctx, "root", mf, fmt.Sprintf("trans=fd,rfdno=%d,wfdno=%d,privateunixsocket=true", fd, fd))
+		if err != nil {
+			return nil, fmt.Errorf("failed to generate root mount point: %v", err)
+		}
+
+	case FileAccessDirect:
+		hostFS := mustFindFilesystem("whitelistfs")
+		rootInode, err = hostFS.Mount(ctx, "root", mf, "root="+spec.Root.Path+",dont_translate_ownership=true")
+		if err != nil {
+			return nil, fmt.Errorf("failed to generate root mount point: %v", err)
+		}
+
+	default:
+		return nil, fmt.Errorf("invalid file access type: %v", conf.FileAccess)
+	}
+
+	// We need to overlay the root on top of a ramfs with stub directories
+	// for submount paths.  "/dev" "/sys" "/proc" and "/tmp" are always
+	// mounted even if they are not in the spec.
+	submounts := append(subtargets("/", spec.Mounts), "/dev", "/sys", "/proc", "/tmp")
+	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
+	if err != nil {
+		return nil, fmt.Errorf("error adding submount overlay: %v", err)
+	}
+
+	if conf.Overlay {
+		log.Debugf("Adding overlay on top of root mount")
+		// Overlay a tmpfs filesystem on top of the root.
+		rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	log.Infof("Mounted %q to \"/\" type root", spec.Root.Path)
+	return rootInode, nil
+}
+
+// addOverlay places a fresh tmpfs layer on top of lower and returns the root
+// inode of the resulting overlay. The tmpfs mount is named after name with an
+// "-upper" suffix and uses lowerFlags with ReadOnly cleared; the same flags
+// are passed on when the overlay itself is created.
+func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
+	// The upper layer shares the lower layer's flags, except that it has to
+	// be read-write.
+	upperFlags := lowerFlags
+	upperFlags.ReadOnly = false
+
+	upper, err := mustFindFilesystem("tmpfs").Mount(ctx, name+"-upper", upperFlags, "")
+	if err != nil {
+		return nil, fmt.Errorf("failed to create tmpfs overlay: %v", err)
+	}
+	return fs.NewOverlayRoot(ctx, upper, lower, upperFlags)
+}
+
+// mountSubmount mounts the filesystem described by m at m.Destination inside
+// mns.
+//
+// Mount types that are not supported are skipped with a warning and reported
+// as success. Bind mounts are backed by 9P (consuming one FD from fds) or by
+// whitelistfs, depending on conf.FileAccess. When conf.Overlay is set,
+// writable directory bind mounts get a tmpfs overlay and their lower layer is
+// mounted read-only.
+func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount) error {
+	// Map mount type to filesystem name, and parse out the options that we are
+	// capable of dealing with.
+	var (
+		data   []string
+		fsName string
+		// bindDir is set for bind mounts whose source is a directory. Only
+		// those are candidates for an overlay.
+		bindDir bool
+	)
+	switch m.Type {
+	case "proc", "sysfs", "devtmpfs":
+		fsName = m.Type
+	case "none":
+		fsName = "sysfs"
+	case "tmpfs":
+		fsName = m.Type
+
+		// tmpfs has some extra supported options that we must pass through.
+		var err error
+		data, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid")
+		if err != nil {
+			return err
+		}
+	case "bind":
+		switch conf.FileAccess {
+		case FileAccessProxy:
+			// Each proxied bind mount needs its own I/O FD. Fail cleanly
+			// instead of panicking when the caller ran out of them.
+			if fds == nil || fds.empty() {
+				return fmt.Errorf("no I/O FD available for bind mount at %q", m.Destination)
+			}
+			fd := fds.remove()
+			fsName = "9p"
+			data = []string{"trans=fd", fmt.Sprintf("rfdno=%d", fd), fmt.Sprintf("wfdno=%d", fd), "privateunixsocket=true"}
+		case FileAccessDirect:
+			fsName = "whitelistfs"
+			data = []string{"root=" + m.Source, "dont_translate_ownership=true"}
+		default:
+			return fmt.Errorf("invalid file access type: %v", conf.FileAccess)
+		}
+
+		fi, err := os.Stat(m.Source)
+		if err != nil {
+			return fmt.Errorf("failed to stat mount source %q: %v", m.Source, err)
+		}
+		bindDir = fi.Mode().IsDir()
+	default:
+		// TODO: Support all the mount types and make this a
+		// fatal error.  Most applications will "just work" without
+		// them, so this is a warning for now.
+		log.Warningf("ignoring unknown filesystem type %q", m.Type)
+		return nil
+	}
+
+	// All filesystem names should have been mapped to something we know.
+	filesystem := mustFindFilesystem(fsName)
+
+	// Parse the options exactly once so that warnings about unknown options
+	// are not logged twice.
+	mf := mountFlags(m.Options)
+
+	// Add overlay to all writable mounts, except when mapping an individual file.
+	useOverlay := conf.Overlay && bindDir && !mf.ReadOnly
+	if useOverlay {
+		// All writes go to upper, be paranoid and make lower readonly.
+		mf.ReadOnly = true
+	}
+	mf.NoAtime = true
+
+	inode, err := filesystem.Mount(ctx, m.Type, mf, strings.Join(data, ","))
+	if err != nil {
+		return fmt.Errorf("failed to create mount with source %q: %v", m.Source, err)
+	}
+
+	// If there are submounts, we need to overlay the mount on top of a
+	// ramfs with stub directories for submount paths.
+	//
+	// We do not do this for /dev, since there will usually be submounts in
+	// the spec, but our devfs implementation contains all the necessary
+	// directories and files (well, most of them anyways).
+	if m.Destination != "/dev" {
+		submounts := subtargets(m.Destination, spec.Mounts)
+		if len(submounts) > 0 {
+			log.Infof("Adding submount overlay over %q", m.Destination)
+			inode, err = addSubmountOverlay(ctx, inode, submounts)
+			if err != nil {
+				return fmt.Errorf("error adding submount overlay: %v", err)
+			}
+		}
+	}
+
+	if useOverlay {
+		log.Debugf("Adding overlay on top of mount %q", m.Destination)
+		if inode, err = addOverlay(ctx, conf, inode, m.Type, mf); err != nil {
+			return fmt.Errorf("failed to add overlay on top of mount %q: %v", m.Destination, err)
+		}
+	}
+
+	// Look up the destination and attach the new inode there. Both dirent
+	// references are dropped once the mount has been made.
+	root := mns.Root()
+	defer root.DecRef()
+	dirent, err := mns.FindInode(ctx, root, nil, m.Destination, linux.MaxSymlinkTraversals)
+	if err != nil {
+		return fmt.Errorf("failed to find mount destination %q: %v", m.Destination, err)
+	}
+	defer dirent.DecRef()
+	if err := mns.Mount(ctx, dirent, inode); err != nil {
+		return fmt.Errorf("failed to mount at destination %q: %v", m.Destination, err)
+	}
+
+	log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type)
+	return nil
+}
+
+// mkdirAll walks path one component at a time starting at the root of mns,
+// creating every directory that does not exist yet with mode 0755. The path
+// is always treated as absolute.
+//
+// Every dirent reference obtained during the walk is dropped again before
+// returning.
+func mkdirAll(ctx context.Context, mns *fs.MountNamespace, path string) error {
+	root := mns.Root()
+	defer root.DecRef()
+
+	// parent holds its own reference at all times, independent of root's, so
+	// that it can be released uniformly as the walk advances.
+	parent := root
+	parent.IncRef()
+	defer func() { parent.DecRef() }()
+
+	// Starting at the root, walk the path.
+	for _, name := range strings.Split(filepath.Clean(path), string(filepath.Separator)) {
+		if name == "" {
+			// This will be case for the first and last element, if the path
+			// begins or ends with '/'. Note that we always treat the path as
+			// absolute, regardless of what the first character contains.
+			continue
+		}
+		d, err := mns.FindInode(ctx, root, parent, name, fs.DefaultTraversalLimit)
+		if err == syserror.ENOENT {
+			// If we encounter a path that does not exist, then
+			// create it.
+			if err := parent.CreateDirectory(ctx, root, name, fs.FilePermsFromMode(0755)); err != nil {
+				return fmt.Errorf("failed to create directory %q: %v", name, err)
+			}
+			if d, err = parent.Walk(ctx, root, name); err != nil {
+				return fmt.Errorf("walk to %q failed: %v", name, err)
+			}
+		} else if err != nil {
+			return fmt.Errorf("failed to find inode %q: %v", name, err)
+		}
+
+		// d carries a reference of its own (NOTE(review): assumed for Walk
+		// as well as FindInode), so release the previous parent before
+		// descending into it.
+		parent.DecRef()
+		parent = d
+	}
+	return nil
+}
+
+// parseAndFilterOptions returns the entries of opts whose key is listed in
+// allowedKeys, in their original order and spelling.
+//
+// An option is either a bare "key" or a "key=value" pair. Options with a key
+// that is not allowed are dropped with a warning. An option containing more
+// than one '=' is rejected with an error.
+func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) {
+	var out []string
+	for _, o := range opts {
+		kv := strings.Split(o, "=")
+		if len(kv) > 2 {
+			return nil, fmt.Errorf("invalid option %q", o)
+		}
+
+		// For a bare option the key is the option itself.
+		key := kv[0]
+		if contains(allowedKeys, key) {
+			out = append(out, o)
+			continue
+		}
+		// Log the key string, not the split slice, so that the message reads
+		// the same for both option forms.
+		log.Warningf("ignoring unsupported key %q", key)
+	}
+	return out, nil
+}
+
+// destinations returns the Destination of every mount, in order, followed by
+// the extra paths.
+func destinations(mounts []specs.Mount, extra ...string) []string {
+	var dsts []string
+	for i := range mounts {
+		dsts = append(dsts, mounts[i].Destination)
+	}
+	dsts = append(dsts, extra...)
+	return dsts
+}
+
+// mountFlags translates mount options into fs.MountSourceFlags.
+//
+// "ro" and "rw" set and clear ReadOnly, with the last one given winning, and
+// "noatime" sets NoAtime. Any other option is ignored with a warning.
+func mountFlags(opts []string) fs.MountSourceFlags {
+	mf := fs.MountSourceFlags{}
+	for _, o := range opts {
+		switch o {
+		case "rw":
+			// Read-write is the default, but it is a perfectly valid option
+			// and must not be reported as unknown.
+			mf.ReadOnly = false
+		case "ro":
+			mf.ReadOnly = true
+		case "noatime":
+			mf.NoAtime = true
+		default:
+			log.Warningf("ignoring unknown mount option %q", o)
+		}
+	}
+	return mf
+}
+
+// contains reports whether str is one of the elements of strs.
+func contains(strs []string, str string) bool {
+	for i := 0; i < len(strs); i++ {
+		if strs[i] == str {
+			return true
+		}
+	}
+	return false
+}
+
+// mustFindFilesystem looks up the filesystem registered under name and
+// panics if there is none.
+func mustFindFilesystem(name string) fs.Filesystem {
+	if filesystem, found := fs.FindFilesystem(name); found {
+		return filesystem
+	}
+	panic(fmt.Sprintf("could not find filesystem %q", name))
+}
+
+// addSubmountOverlay builds a ramfs directory tree containing the given
+// submount paths and returns an overlay with inode as the upper layer and
+// that tree as the lower layer.
+func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
+	// There is no real filesystem backing this ramfs tree, so the mount
+	// source is created with "nil".
+	msrc := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{})
+	stubTree, err := ramfs.MakeDirectoryTree(ctx, msrc, submounts)
+	if err != nil {
+		return nil, fmt.Errorf("error creating mount tree: %v", err)
+	}
+
+	overlaid, err := fs.NewOverlayRoot(ctx, inode, stubTree, fs.MountSourceFlags{})
+	if err != nil {
+		return nil, fmt.Errorf("failed to make mount overlay: %v", err)
+	}
+	return overlaid, nil
+}
+
+// subtargets takes a set of Mounts and returns only the targets that are
+// strictly below the given root. Each returned path is the cleaned mount
+// destination with the cleaned root stripped from its front.
+//
+// A mount at the root itself is not a submount and is left out, as is any
+// destination that merely shares a name prefix with the root (e.g. "/foobar"
+// for root "/foo").
+func subtargets(root string, mnts []specs.Mount) []string {
+	r := filepath.Clean(root)
+
+	// A real child starts with the root followed by a path separator. The
+	// only cleaned path that already ends in one is "/" itself.
+	prefix := r
+	if !strings.HasSuffix(prefix, "/") {
+		prefix += "/"
+	}
+
+	var targets []string
+	for _, mnt := range mnts {
+		t := filepath.Clean(mnt.Destination)
+		if !strings.HasPrefix(t, prefix) {
+			continue
+		}
+		// Make the mnt path relative to the root path.  If the
+		// result is empty, then mnt IS the root mount, not a
+		// submount.  We don't want to include those.
+		if rel := strings.TrimPrefix(t, r); rel != "" {
+			targets = append(targets, rel)
+		}
+	}
+	return targets
+}
diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go
new file mode 100644
index 000000000..ea72de8e9
--- /dev/null
+++ b/runsc/boot/limits.go
@@ -0,0 +1,60 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+)
+
+// fromLinuxResource maps Linux resource limit names ("RLIMIT_*"), as they
+// appear in the Type field of the spec's process rlimits, to the matching
+// limits.LimitType.
+var fromLinuxResource = map[string]limits.LimitType{
+	"RLIMIT_CPU":        limits.CPU,
+	"RLIMIT_FSIZE":      limits.FileSize,
+	"RLIMIT_DATA":       limits.Data,
+	"RLIMIT_STACK":      limits.Stack,
+	"RLIMIT_CORE":       limits.Core,
+	"RLIMIT_RSS":        limits.Rss,
+	"RLIMIT_NPROC":      limits.ProcessCount,
+	"RLIMIT_NOFILE":     limits.NumberOfFiles,
+	"RLIMIT_MEMLOCK":    limits.MemoryPagesLocked,
+	"RLIMIT_AS":         limits.AS,
+	"RLIMIT_LOCKS":      limits.Locks,
+	"RLIMIT_SIGPENDING": limits.SignalsPending,
+	"RLIMIT_MSGQUEUE":   limits.MessageQueueBytes,
+	"RLIMIT_NICE":       limits.Nice,
+	"RLIMIT_RTPRIO":     limits.RealTimePriority,
+	"RLIMIT_RTTIME":     limits.Rttime,
+}
+
+// createLimitSet returns the limit set from limits.NewLinuxDistroLimitSet
+// with every rlimit listed in spec.Process.Rlimits applied on top. It fails
+// if the spec names a resource that is not in fromLinuxResource.
+func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) {
+	limitSet, err := limits.NewLinuxDistroLimitSet()
+	if err != nil {
+		return nil, err
+	}
+
+	rlimits := spec.Process.Rlimits
+	for i := range rlimits {
+		rl := rlimits[i]
+		limitType, known := fromLinuxResource[rl.Type]
+		if !known {
+			return nil, fmt.Errorf("unknown resource %q", rl.Type)
+		}
+		// The spec's soft limit becomes Cur and its hard limit becomes Max.
+		limit := limits.Limit{Cur: rl.Soft, Max: rl.Hard}
+		limitSet.SetUnchecked(limitType, limit)
+	}
+	return limitSet, nil
+}
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
new file mode 100644
index 000000000..a470cb054
--- /dev/null
+++ b/runsc/boot/loader.go
@@ -0,0 +1,354 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package boot loads the kernel and runs the application.
+package boot
+
+import (
+	"fmt"
+	"math/rand"
+	"sync/atomic"
+	"syscall"
+	gtime "time"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/cpuid"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/inet"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/loader"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/sighandling"
+	slinux "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/time"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/watchdog"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/udp"
+	"gvisor.googlesource.com/gvisor/runsc/boot/filter"
+	"gvisor.googlesource.com/gvisor/runsc/specutils"
+
+	// Include supported socket providers.
+	"gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/socket/hostinet"
+	_ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink"
+	_ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/route"
+	_ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix"
+)
+
+// Loader keeps state needed to start the kernel and run the application.
+type Loader struct {
+	// k is the kernel.
+	k *kernel.Kernel
+
+	// ctrl is the control server.
+	ctrl *controller
+
+	// conf is the configuration the loader was created with.
+	conf *Config
+
+	// console is set to true if terminal is enabled.
+	console bool
+
+	// watchdog is started right before the kernel is started and stopped
+	// when the loader is destroyed.
+	watchdog *watchdog.Watchdog
+
+	// stopSignalForwarding disables forwarding of signals to the sandboxed
+	// app. It should be called when a sandbox is destroyed.
+	stopSignalForwarding func()
+
+	// procArgs refers to the initial application task.
+	procArgs kernel.CreateProcessArgs
+}
+
+// init seeds the math/rand generator from the current time and registers the
+// AMD64 Linux syscall table with the kernel package.
+func init() {
+	// Initialize the random number generator.
+	seed := gtime.Now().UnixNano()
+	rand.Seed(seed)
+
+	// Register the global syscall table.
+	table := slinux.AMD64
+	kernel.RegisterSyscallTable(table)
+}
+
+// New initializes a new kernel loader configured by spec.
+//
+// It creates the platform, kernel, control server (on controllerFD), mount
+// namespace (consuming ioFDs) and FD map, and prepares the arguments for the
+// initial process. Nothing is started yet; that happens in Run. If console
+// is true, ioctl calls on the standard FDs are passed through to the host.
+//
+// Note that the process umask is reset to 0 as a side effect of reading it.
+func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console bool) (*Loader, error) {
+	// Create kernel and platform.
+	p, err := createPlatform(conf)
+	if err != nil {
+		return nil, fmt.Errorf("error creating platform: %v", err)
+	}
+	k := &kernel.Kernel{
+		Platform: p,
+	}
+
+	// Create VDSO.
+	vdso, err := loader.PrepareVDSO(p)
+	if err != nil {
+		return nil, fmt.Errorf("error creating vdso: %v", err)
+	}
+
+	// Create timekeeper.
+	tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
+	if err != nil {
+		return nil, fmt.Errorf("error creating timekeeper: %v", err)
+	}
+	tk.SetClocks(time.NewCalibratedClocks())
+
+	// Create initial limits.
+	ls, err := createLimitSet(spec)
+	if err != nil {
+		return nil, fmt.Errorf("error creating limits: %v", err)
+	}
+
+	// Create capabilities.
+	caps, err := specutils.Capabilities(spec.Process.Capabilities)
+	if err != nil {
+		return nil, fmt.Errorf("error creating capabilities: %v", err)
+	}
+
+	// Convert the spec's additional GIDs to KGIDs.
+	extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids))
+	for _, GID := range spec.Process.User.AdditionalGids {
+		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
+	}
+
+	// Create credentials. This call does not return an error.
+	creds := auth.NewUserCredentials(
+		auth.KUID(spec.Process.User.UID),
+		auth.KGID(spec.Process.User.GID),
+		extraKGIDs,
+		caps,
+		auth.NewRootUserNamespace())
+
+	// Create UTS namespace.
+	// TODO: Not clear what domain name should be here.  It is
+	// not configurable from runtime spec.
+	utsns := kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace)
+
+	ipcns := kernel.NewIPCNamespace()
+
+	if err := enableStrace(conf); err != nil {
+		return nil, fmt.Errorf("failed to enable strace: %v", err)
+	}
+
+	// The first argument names the executable, so there has to be one.
+	if len(spec.Process.Args) == 0 {
+		return nil, fmt.Errorf("error getting executable path: no process arguments in spec")
+	}
+
+	// Get the executable path, which is a bit tricky because we have to
+	// inspect the environment PATH which is relative to the root path.
+	exec, err := specutils.GetExecutablePath(spec.Process.Args[0], spec.Root.Path, spec.Process.Env)
+	if err != nil {
+		return nil, fmt.Errorf("error getting executable path: %v", err)
+	}
+
+	// Create the process arguments.
+	procArgs := kernel.CreateProcessArgs{
+		Filename:         exec,
+		Argv:             spec.Process.Args,
+		Envv:             spec.Process.Env,
+		WorkingDirectory: spec.Process.Cwd,
+		Credentials:      creds,
+		// Creating the FDMap requires that we have kernel.Kernel.fdMapUids, so
+		// it must wait until we have a Kernel.
+		Umask:                uint(syscall.Umask(0)),
+		Limits:               ls,
+		MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
+		UTSNamespace:         utsns,
+		IPCNamespace:         ipcns,
+	}
+
+	// Create an empty network stack because the network namespace may be empty at
+	// this point. Netns is configured before Run() is called. Netstack is
+	// configured using a control uRPC message. Host network is configured inside
+	// Run().
+	networkStack := newEmptyNetworkStack(conf)
+
+	// Initiate the Kernel object, which is required by the Context passed
+	// to createMountNamespace in order to mount (among other things) procfs.
+	if err = k.Init(kernel.InitKernelArgs{
+		FeatureSet:        cpuid.HostFeatureSet(),
+		Timekeeper:        tk,
+		RootUserNamespace: creds.UserNamespace,
+		NetworkStack:      networkStack,
+		ApplicationCores:  8,
+		Vdso:              vdso,
+		RootUTSNamespace:  utsns,
+		RootIPCNamespace:  ipcns,
+	}); err != nil {
+		return nil, fmt.Errorf("error initializing kernel: %v", err)
+	}
+
+	// Turn on packet logging if enabled.
+	if conf.LogPackets {
+		log.Infof("Packet logging enabled")
+		atomic.StoreUint32(&sniffer.LogPackets, 1)
+	} else {
+		log.Infof("Packet logging disabled")
+		atomic.StoreUint32(&sniffer.LogPackets, 0)
+	}
+
+	// Create the control server using the provided FD.
+	//
+	// This must be done *after* we have initialized the kernel since the
+	// controller is used to configure the kernel's network stack.
+	//
+	// This should also be *before* we create the process, since a
+	// misconfigured process will cause an error, and we want the control
+	// server up before that so that we don't time out trying to connect to
+	// it.
+	ctrl, err := newController(controllerFD, k)
+	if err != nil {
+		return nil, fmt.Errorf("error creating control server: %v", err)
+	}
+
+	ctx := procArgs.NewContext(k)
+
+	// Create the virtual filesystem.
+	mm, err := createMountNamespace(ctx, spec, conf, ioFDs)
+	if err != nil {
+		return nil, fmt.Errorf("error creating mounts: %v", err)
+	}
+	k.SetRootMountNamespace(mm)
+
+	// Create the FD map, which will set stdin, stdout, and stderr.  If console
+	// is true, then ioctl calls will be passed through to the host fd.
+	fdm, err := createFDMap(ctx, k, ls, console)
+	if err != nil {
+		return nil, fmt.Errorf("error importing fds: %v", err)
+	}
+
+	// CreateProcess takes a reference on FDMap if successful. We
+	// won't need ours either way.
+	procArgs.FDMap = fdm
+
+	// We don't care about child signals; some platforms can generate a
+	// tremendous number of useless ones (I'm looking at you, ptrace).
+	if err := sighandling.IgnoreChildStop(); err != nil {
+		return nil, fmt.Errorf("failed to ignore child stop signals: %v", err)
+	}
+	// Ensure that most signals received in sentry context are forwarded to
+	// the emulated kernel.
+	stopSignalForwarding := sighandling.StartForwarding(k)
+
+	// Named wd so that the watchdog package is not shadowed.
+	wd := watchdog.New(k, watchdog.DefaultTimeout, watchdog.LogWarning)
+	return &Loader{
+		k:                    k,
+		ctrl:                 ctrl,
+		conf:                 conf,
+		console:              console,
+		watchdog:             wd,
+		stopSignalForwarding: stopSignalForwarding,
+		procArgs:             procArgs,
+	}, nil
+}
+
+// Destroy releases what the loader holds: it stops the control server if
+// there is one, then stops signal forwarding and the watchdog.
+func (l *Loader) Destroy() {
+	// Shut down control server.
+	if ctrl := l.ctrl; ctrl != nil {
+		ctrl.srv.Stop()
+	}
+
+	l.stopSignalForwarding()
+	l.watchdog.Stop()
+}
+
+// createPlatform instantiates the platform selected by conf.Platform, or
+// returns an error if the value is not a known platform.
+func createPlatform(conf *Config) (platform.Platform, error) {
+	if conf.Platform == PlatformPtrace {
+		log.Infof("Platform: ptrace")
+		return ptrace.New()
+	}
+	if conf.Platform == PlatformKVM {
+		log.Infof("Platform: kvm")
+		return kvm.New()
+	}
+	return nil, fmt.Errorf("invalid platform %v", conf.Platform)
+}
+
+// Run runs the application. The outcome of starting it is sent on the
+// control server's start result channel and also returned to the caller.
+func (l *Loader) Run() error {
+	result := l.run()
+	l.ctrl.app.startResultChan <- result
+	return result
+}
+
+// run finishes configuring the sandbox and starts the application: it
+// configures the host network stack when host networking is in use, installs
+// the seccomp filters unless they are disabled, creates the initial process,
+// and starts the watchdog and the kernel.
+func (l *Loader) run() error {
+	hostNet := l.conf.Network == NetworkHost
+	if hostNet {
+		// Host network configuration is delayed to this point because the
+		// network namespace is configured after the loader is created and
+		// before Run() is called.
+		log.Debugf("Configuring host network")
+		hostStack := l.k.NetworkStack().(*hostinet.Stack)
+		if err := hostStack.Configure(); err != nil {
+			return err
+		}
+	}
+
+	// All configuration is done. The filters have to be in place before any
+	// user code is loaded.
+	if !l.conf.DisableSeccomp {
+		directFS := l.conf.FileAccess == FileAccessDirect
+		if err := filter.Install(l.k.Platform, directFS, l.console, hostNet); err != nil {
+			return fmt.Errorf("Failed to install seccomp filters: %v", err)
+		}
+	} else {
+		filter.Report("syscall filter is DISABLED. Running in less secure mode.")
+	}
+
+	// Create the initial application task.
+	_, err := l.k.CreateProcess(l.procArgs)
+	if err != nil {
+		return fmt.Errorf("failed to create init process: %v", err)
+	}
+
+	// CreateProcess takes a reference on FDMap if successful.
+	l.procArgs.FDMap.DecRef()
+
+	l.watchdog.Start()
+	return l.k.Start()
+}
+
+// WaitForStartSignal blocks until a start signal is received on the control
+// server's start channel.
+func (l *Loader) WaitForStartSignal() {
+	<-l.ctrl.app.startChan
+}
+
+// WaitExit blocks until the kernel reports that the application has exited,
+// then returns the exit status of the kernel's global init task.
+func (l *Loader) WaitExit() kernel.ExitStatus {
+	// Wait for application.
+	l.k.WaitExited()
+
+	initTask := l.k.GlobalInit()
+	return initTask.ExitStatus()
+}
+
+// newEmptyNetworkStack creates the not-yet-configured network stack matching
+// conf.Network: a hostinet stack for NetworkHost, and a netstack-backed
+// epsocket stack for NetworkNone and NetworkSandbox. It panics on any other
+// value.
+func newEmptyNetworkStack(conf *Config) inet.Stack {
+	if conf.Network == NetworkHost {
+		return hostinet.NewStack()
+	}
+	if conf.Network != NetworkNone && conf.Network != NetworkSandbox {
+		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
+	}
+
+	// NetworkNone sets up loopback using netstack.
+	netstack := stack.New(
+		[]string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName},
+		[]string{tcp.ProtocolName, udp.ProtocolName},
+	)
+	return &epsocket.Stack{netstack}
+}
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
new file mode 100644
index 000000000..2fc16b241
--- /dev/null
+++ b/runsc/boot/loader_test.go
@@ -0,0 +1,238 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"os"
+	"testing"
+	"time"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.googlesource.com/gvisor/pkg/control/server"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
+)
+
+// init sets the log level to Debug when the test package is initialized.
+func init() {
+	log.SetLevel(log.Debug)
+}
+
+// testSpec returns a minimal spec for use in tests: a read-only root at "/"
+// and a process whose only argument is /bin/true.
+func testSpec() *specs.Spec {
+	// The host filesystem root is the sandbox root.
+	root := &specs.Root{
+		Path:     "/",
+		Readonly: true,
+	}
+	process := &specs.Process{
+		Args: []string{"/bin/true"},
+	}
+	return &specs.Spec{Root: root, Process: process}
+}
+
+// createLoader creates a control socket for the fixed ID "123" and returns a
+// Loader built from testSpec with seccomp disabled, no network and direct
+// file access.
+func createLoader() (*Loader, error) {
+	cfg := &Config{
+		RootDir:        "unused_root_dir",
+		Network:        NetworkNone,
+		FileAccess:     FileAccessDirect,
+		DisableSeccomp: true,
+	}
+
+	ctrlFD, err := server.CreateSocket(ControlSocketAddr("123"))
+	if err != nil {
+		return nil, err
+	}
+	return New(testSpec(), cfg, ctrlFD, nil, false)
+}
+
+// TestRun runs a simple application in a sandbox and checks that it succeeds.
+func TestRun(t *testing.T) {
+	s, err := createLoader()
+	if err != nil {
+		t.Fatalf("error creating loader: %v", err)
+	}
+	defer s.Destroy()
+
+	// Run the application. A failure here is fatal for the test: WaitExit
+	// below waits for the application to exit, which cannot give a
+	// meaningful result if the application was never started.
+	if err := s.Run(); err != nil {
+		t.Fatalf("error running application: %v", err)
+	}
+
+	// Wait for the application to exit.  It should succeed.
+	if status := s.WaitExit(); status.Code != 0 || status.Signo != 0 {
+		t.Errorf("application exited with status %+v, want 0", status)
+	}
+}
+
+// TestStartSignal tests that the controller Start message will cause
+// WaitForStartSignal to return.
+func TestStartSignal(t *testing.T) {
+	s, err := createLoader()
+	if err != nil {
+		t.Fatalf("error creating loader: %v", err)
+	}
+	defer s.Destroy()
+
+	// We aren't going to wait on this application, so the control server
+	// needs to be shut down manually.
+	defer s.ctrl.srv.Stop()
+
+	// Start a goroutine that calls WaitForStartSignal and writes to a
+	// channel when it returns. The channel has room for one value so the
+	// goroutine can still finish if the test has already given up waiting
+	// for it, instead of blocking on the send forever.
+	waitFinished := make(chan struct{}, 1)
+	go func() {
+		s.WaitForStartSignal()
+		// Pretend that Run() executed and returned no error.
+		s.ctrl.app.startResultChan <- nil
+		waitFinished <- struct{}{}
+	}()
+
+	// Nothing has been written to the channel, so waitFinished should not
+	// return.  Give it a little bit of time to make sure the goroutine has
+	// started.
+	select {
+	case <-waitFinished:
+		t.Errorf("WaitForStartSignal completed but it should not have")
+	case <-time.After(50 * time.Millisecond):
+		// OK.
+	}
+
+	// Trigger the control server Start method.
+	if err := s.ctrl.app.Start(nil, nil); err != nil {
+		t.Errorf("error calling Start: %v", err)
+	}
+
+	// Now WaitForStartSignal should return (within a short amount of
+	// time).
+	select {
+	case <-waitFinished:
+		// OK.
+	case <-time.After(50 * time.Millisecond):
+		t.Errorf("WaitForStartSignal did not complete but it should have")
+	}
+}
+
+// TestCreateMountNamespace tests that a MountNamespace can be created with
+// various specs and that the expected paths exist in the result.
+//
+// Each case runs as its own subtest so that the references taken on the mount
+// namespace and its root are released as soon as that case finishes, rather
+// than accumulating until the whole test function returns.
+func TestCreateMountNamespace(t *testing.T) {
+	conf := &Config{
+		RootDir:        "unused_root_dir",
+		FileAccess:     FileAccessDirect,
+		DisableSeccomp: true,
+	}
+
+	testCases := []struct {
+		name string
+		// Spec that will be used to create the mount manager.  Note
+		// that we can't mount procfs without a kernel, so each spec
+		// MUST contain something other than procfs mounted at /proc.
+		spec specs.Spec
+		// Paths that are expected to exist in the resulting fs.
+		expectedPaths []string
+	}{
+		{
+			// Only a (tmpfs) mount at /proc.
+			name: "only proc mount",
+			spec: specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
+				},
+				Mounts: []specs.Mount{
+					{
+						Destination: "/proc",
+						Type:        "tmpfs",
+					},
+				},
+			},
+			// /proc, /dev, and /sys should always be mounted.
+			expectedPaths: []string{"/proc", "/dev", "/sys"},
+		},
+		{
+			// Mount at a deep path, with many components that do
+			// not exist in the root.
+			name: "deep mount path",
+			spec: specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
+				},
+				Mounts: []specs.Mount{
+					{
+						Destination: "/some/very/very/deep/path",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/proc",
+						Type:        "tmpfs",
+					},
+				},
+			},
+			// /some/very/very/deep/path should be mounted, along
+			// with /proc, /dev, and /sys.
+			expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"},
+		},
+		{
+			// Mounts are nested inside each other.
+			name: "nested mounts",
+			spec: specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
+				},
+				Mounts: []specs.Mount{
+					{
+						Destination: "/proc",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/foo",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/foo/bar",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/foo/bar/baz",
+						Type:        "tmpfs",
+					},
+					{
+						// A deep path that is in foo but not the other mounts.
+						Destination: "/foo/some/very/very/deep/path",
+						Type:        "tmpfs",
+					},
+				},
+			},
+			expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"},
+		},
+	}
+
+	for _, tc := range testCases {
+		// The subtest runs synchronously, so using tc inside the closure
+		// is safe.
+		t.Run(tc.name, func(t *testing.T) {
+			ctx := contexttest.Context(t)
+			mm, err := createMountNamespace(ctx, &tc.spec, conf, nil)
+			if err != nil {
+				t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err)
+			}
+			defer mm.DecRef()
+			root := mm.Root()
+			defer root.DecRef()
+			for _, p := range tc.expectedPaths {
+				if _, err := mm.FindInode(ctx, root, root, p, 0); err != nil {
+					t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
+				}
+			}
+		})
+	}
+}
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
new file mode 100644
index 000000000..d2b52c823
--- /dev/null
+++ b/runsc/boot/network.go
@@ -0,0 +1,213 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"math/rand"
+	"net"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
+	"gvisor.googlesource.com/gvisor/pkg/urpc"
+)
+
+// Network exposes methods that can be used to configure a network stack.
+type Network struct {
+	// Stack is the network stack that the methods of Network create NICs,
+	// addresses and routes on.
+	Stack *stack.Stack
+}
+
+// Route represents a route in the network stack.
+//
+// A Route whose three fields are all nil is considered unset (see Empty).
+// Each field is converted to a tcpip.Address by toTcpipRoute.
+type Route struct {
+	Destination net.IP
+	Mask        net.IPMask
+	Gateway     net.IP
+}
+
+// DefaultRoute represents a catch all route to the default gateway.
+type DefaultRoute struct {
+	// Route is the route to install. CreateLinksAndRoutes ignores the
+	// default gateway when Route.Empty() reports true.
+	Route Route
+	// Name is the name of the interface the route uses. It must match the
+	// Name of one of the links passed to CreateLinksAndRoutes.
+	Name  string
+}
+
+// FDBasedLink configures an fd-based link.
+//
+// The file descriptor backing the link is not part of this struct; it is
+// passed alongside in CreateLinksAndRoutesArgs.FilePayload at the same index.
+type FDBasedLink struct {
+	// Name is the NIC name; it is also the key used to resolve the default
+	// gateway's interface.
+	Name      string
+	// MTU is converted to uint32 when the link endpoint is created.
+	MTU       int
+	// Addresses are added to the NIC created for this link.
+	Addresses []net.IP
+	// Routes are added to the stack's route table, bound to this NIC.
+	Routes    []Route
+}
+
+// LoopbackLink configures a loopback link.
+type LoopbackLink struct {
+	// Name is the NIC name; it is also the key used to resolve the default
+	// gateway's interface.
+	Name      string
+	// Addresses are added to the NIC created for this link.
+	Addresses []net.IP
+	// Routes are added to the stack's route table, bound to this NIC.
+	Routes    []Route
+}
+
+// CreateLinksAndRoutesArgs are arguments to CreateLinksAndRoutes.
+type CreateLinksAndRoutesArgs struct {
+	// FilePayload contains the fds associated with the FDBasedLinks.  The
+	// two slices must have the same length; Files[i] backs FDBasedLinks[i].
+	urpc.FilePayload
+
+	// LoopbackLinks are created first, before any FDBasedLinks.
+	LoopbackLinks []LoopbackLink
+	FDBasedLinks  []FDBasedLink
+
+	// DefaultGateway is optional; it is skipped when its Route is empty.
+	DefaultGateway DefaultRoute
+}
+
+// Empty reports whether the route is unset, i.e. Destination, Mask and
+// Gateway are all nil.
+func (r *Route) Empty() bool {
+	if r.Destination != nil || r.Mask != nil {
+		return false
+	}
+	return r.Gateway == nil
+}
+
+// toTcpipRoute converts r into a tcpip.Route bound to the NIC with the given
+// id.
+func (r *Route) toTcpipRoute(id tcpip.NICID) tcpip.Route {
+	dst := ipToAddress(r.Destination)
+	gw := ipToAddress(r.Gateway)
+	// The mask is reinterpreted as a net.IP so that it goes through the same
+	// conversion as the addresses.
+	mask := ipToAddress(net.IP(r.Mask))
+
+	return tcpip.Route{Destination: dst, Gateway: gw, Mask: mask, NIC: id}
+}
+
+// CreateLinksAndRoutes creates links and routes in a network stack.  It should
+// only be called once.
+//
+// NIC IDs are assigned sequentially starting at 1: loopback links first, in
+// the order given, followed by the fd-based links. args.FilePayload.Files[i]
+// supplies the file descriptor for args.FDBasedLinks[i]; the descriptor is
+// duplicated and the copy is handed to the link endpoint. The routes of all
+// links, plus the default gateway route if one is set, replace the stack's
+// route table.
+//
+// An error is returned if the number of files does not match the number of
+// fd-based links, if a file descriptor cannot be duplicated, if a NIC or one
+// of its addresses cannot be created, or if the default gateway names an
+// interface that is not among the links.
+func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error {
+	if len(args.FilePayload.Files) != len(args.FDBasedLinks) {
+		return fmt.Errorf("FilePayload must be same length as FDBasedLinks")
+	}
+
+	var nicID tcpip.NICID
+	nicids := make(map[string]tcpip.NICID)
+
+	// Collect routes from all links.
+	var routes []tcpip.Route
+
+	// Loopback normally appear before other interfaces.
+	for _, link := range args.LoopbackLinks {
+		nicID++
+		nicids[link.Name] = nicID
+
+		linkEP := loopback.New()
+
+		log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses)
+		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
+			return err
+		}
+
+		// Collect the routes from this link.
+		for _, r := range link.Routes {
+			routes = append(routes, r.toTcpipRoute(nicID))
+		}
+	}
+
+	for i, link := range args.FDBasedLinks {
+		nicID++
+		nicids[link.Name] = nicID
+
+		// Copy the underlying FD.
+		oldFD := args.FilePayload.Files[i].Fd()
+		newFD, err := syscall.Dup(int(oldFD))
+		if err != nil {
+			return fmt.Errorf("failed to dup FD %v: %v", oldFD, err)
+		}
+
+		linkEP := fdbased.New(&fdbased.Options{
+			FD:              newFD,
+			MTU:             uint32(link.MTU),
+			ChecksumOffload: false,
+			EthernetHeader:  true,
+			Address:         tcpip.LinkAddress(generateRndMac()),
+		})
+
+		log.Infof("Enabling interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses)
+		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
+			return err
+		}
+
+		// Collect the routes from this link.
+		for _, r := range link.Routes {
+			routes = append(routes, r.toTcpipRoute(nicID))
+		}
+	}
+
+	if !args.DefaultGateway.Route.Empty() {
+		// Use a distinct name so the running nicID counter is not shadowed.
+		gatewayNIC, ok := nicids[args.DefaultGateway.Name]
+		if !ok {
+			return fmt.Errorf("invalid interface name %q for default route", args.DefaultGateway.Name)
+		}
+		routes = append(routes, args.DefaultGateway.Route.toTcpipRoute(gatewayNIC))
+	}
+
+	log.Infof("Setting routes %+v", routes)
+	n.Stack.SetRouteTable(routes)
+	return nil
+}
+
+// createNICWithAddrs creates a named NIC with the given id on top of linkEP
+// and assigns it the ARP protocol address followed by every address in addrs.
+// The first failure is returned as an error and aborts the remaining steps.
+func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, addrs []net.IP) error {
+	// The link endpoint is wrapped in a sniffer before it is attached.
+	if nicErr := n.Stack.CreateNamedNIC(id, name, sniffer.New(linkEP)); nicErr != nil {
+		return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, linkEP, nicErr)
+	}
+
+	// Always start with an arp address for the NIC.
+	if arpErr := n.Stack.AddAddress(id, arp.ProtocolNumber, arp.ProtocolAddress); arpErr != nil {
+		return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, arp.ProtocolNumber, arp.ProtocolAddress, arpErr)
+	}
+
+	for i := range addrs {
+		proto, tcpipAddr := ipToAddressAndProto(addrs[i])
+		addErr := n.Stack.AddAddress(id, proto, tcpipAddr)
+		if addErr != nil {
+			return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, proto, tcpipAddr, addErr)
+		}
+	}
+	return nil
+}
+
+// ipToAddressAndProto converts IP to tcpip.Address and a protocol number.
+// Anything that ip.To4() cannot represent is reported as IPv6 and converted
+// as-is.
+//
+// Note: don't use 'len(ip)' to determine IP version because length is always 16.
+func ipToAddressAndProto(ip net.IP) (tcpip.NetworkProtocolNumber, tcpip.Address) {
+	v4 := ip.To4()
+	if v4 == nil {
+		return ipv6.ProtocolNumber, tcpip.Address(ip)
+	}
+	return ipv4.ProtocolNumber, tcpip.Address(v4)
+}
+
+// ipToAddress converts IP to tcpip.Address, ignoring the protocol.
+//
+// It is a thin wrapper around ipToAddressAndProto that discards the protocol
+// number.
+func ipToAddress(ip net.IP) tcpip.Address {
+	_, addr := ipToAddressAndProto(ip)
+	return addr
+}
+
+// generateRndMac returns a random local MAC address.
+// Copied from eth_random_addr() (include/linux/etherdevice.h)
+func generateRndMac() net.HardwareAddr {
+	mac := make(net.HardwareAddr, 6)
+	rand.Read(mac)
+	mac[0] &^= 0x1 // clear multicast bit
+	mac[0] |= 0x2  // set local assignment bit (IEEE802)
+	return mac
+}
diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go
new file mode 100644
index 000000000..1e898672b
--- /dev/null
+++ b/runsc/boot/strace.go
@@ -0,0 +1,40 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/sentry/strace"
+)
+
+// enableStrace initializes strace and, when conf.Strace is set, enables
+// syscall logging to the log sink: for every syscall if conf.StraceSyscalls
+// is empty, otherwise only for the listed ones. It returns the error from
+// strace.Enable, if any.
+func enableStrace(conf *Config) error {
+	// We must initialize even if strace is not enabled.
+	strace.Initialize()
+	if !conf.Strace {
+		return nil
+	}
+
+	// A zero StraceLogSize selects the default of 1024.
+	if logSize := conf.StraceLogSize; logSize != 0 {
+		strace.LogMaximumSize = logSize
+	} else {
+		strace.LogMaximumSize = 1024
+	}
+
+	if len(conf.StraceSyscalls) > 0 {
+		return strace.Enable(conf.StraceSyscalls, strace.SinkTypeLog)
+	}
+	strace.EnableAll(strace.SinkTypeLog)
+	return nil
+}
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
new file mode 100644
index 000000000..128c8f7e6
--- /dev/null
+++ b/runsc/cmd/BUILD
@@ -0,0 +1,58 @@
+package(licenses = ["notice"])  # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+# Library holding the implementations of the runsc subcommands. It is only
+# visible to packages under //runsc.
+go_library(
+    name = "cmd",
+    srcs = [
+        "boot.go",
+        "cmd.go",
+        "create.go",
+        "delete.go",
+        "events.go",
+        "exec.go",
+        "gofer.go",
+        "kill.go",
+        "list.go",
+        "path.go",
+        "ps.go",
+        "run.go",
+        "start.go",
+        "state.go",
+    ],
+    importpath = "gvisor.googlesource.com/gvisor/runsc/cmd",
+    visibility = [
+        "//runsc:__subpackages__",
+    ],
+    deps = [
+        "//pkg/log",
+        "//pkg/p9",
+        "//pkg/sentry/control",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/unet",
+        "//pkg/urpc",
+        "//runsc/boot",
+        "//runsc/fsgofer",
+        "//runsc/sandbox",
+        "//runsc/specutils",
+        "@com_github_google_subcommands//:go_default_library",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+# Tests for the cmd library. The test source is exec_test.go and the target
+# embeds :cmd.
+go_test(
+    name = "cmd_test",
+    size = "small",
+    srcs = ["exec_test.go"],
+    embed = [":cmd"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/control",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/urpc",
+        "@com_github_google_go-cmp//cmp:go_default_library",
+        "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+    ],
+)
diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
new file mode 100644
index 000000000..0dad6da79
--- /dev/null
+++ b/runsc/cmd/boot.go
@@ -0,0 +1,161 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"os"
+	"runtime"
+	"runtime/debug"
+	"strings"
+	"syscall"
+
+	"context"
+	"flag"
+	"github.com/google/subcommands"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/runsc/boot"
+	"gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+// Boot implements subcommands.Command for the "boot" command which starts a
+// new sandbox. It should not be called directly.
+//
+// All fields are populated from command line flags registered in SetFlags.
+type Boot struct {
+	// bundleDir is the path to the bundle directory. It is required.
+	bundleDir string
+
+	// controllerFD is the file descriptor of a stream socket for the
+	// control server that is donated to this process. It is required; the
+	// flag default of -1 means "not set".
+	controllerFD int
+
+	// ioFDs is the list of FDs used to connect to FS gofers.
+	ioFDs intFlags
+
+	// console is set to true if the sandbox should allow terminal ioctl(2)
+	// syscalls.
+	console bool
+
+	// applyCaps determines if capabilities defined in the spec should be applied
+	// to the process.
+	applyCaps bool
+}
+
+// Name implements subcommands.Command.Name. It returns the subcommand name,
+// "boot".
+func (*Boot) Name() string {
+	return "boot"
+}
+
+// Synopsis implements subcommands.Command.Synopsis. It returns a one-line
+// description of the command.
+func (*Boot) Synopsis() string {
+	return "launch a sandbox process (internal use only)"
+}
+
+// Usage implements subcommands.Command.Usage. It returns the usage line for
+// the command.
+func (*Boot) Usage() string {
+	return `boot [flags]`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. It registers the flags
+// that populate the fields of b on f.
+func (b *Boot) SetFlags(f *flag.FlagSet) {
+	f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory")
+	f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process")
+	f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec")
+	f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls")
+	f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process")
+}
+
+// Execute implements subcommands.Command.Execute.  It starts a sandbox in a
+// waiting state.
+//
+// The bundle directory and controller FD flags are required and no positional
+// arguments are accepted; otherwise usage is printed and ExitUsageError is
+// returned. args[0] must be a *boot.Config and args[1] a *syscall.WaitStatus;
+// the application's wait status is stored through the latter before Execute
+// returns ExitSuccess. Any other failure terminates the process via Fatalf.
+func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if b.bundleDir == "" || b.controllerFD == -1 || f.NArg() != 0 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	// Ensure that if there is a panic, all goroutine stacks are printed.
+	debug.SetTraceback("all")
+
+	// Get the spec from the bundleDir.
+	spec, err := specutils.ReadSpec(b.bundleDir)
+	if err != nil {
+		Fatalf("error reading spec: %v", err)
+	}
+	specutils.LogSpec(spec)
+
+	// Turn any relative paths in the spec to absolute by prepending the bundleDir.
+	spec.Root.Path = absPath(b.bundleDir, spec.Root.Path)
+	// Index into the slice rather than ranging by value: the range variable
+	// is a copy of each mount, so assigning to it would silently discard
+	// the absolute path.
+	for i := range spec.Mounts {
+		if m := &spec.Mounts[i]; m.Source != "" {
+			m.Source = absPath(b.bundleDir, m.Source)
+		}
+	}
+
+	conf := args[0].(*boot.Config)
+	waitStatus := args[1].(*syscall.WaitStatus)
+
+	if b.applyCaps {
+		// On success this replaces the current process image and does not
+		// return.
+		setCapsAndCallSelf(conf, spec)
+		Fatalf("setCapsAndCallSelf must never return")
+	}
+
+	// Create the loader.
+	s, err := boot.New(spec, conf, b.controllerFD, b.ioFDs.GetArray(), b.console)
+	if err != nil {
+		Fatalf("error creating loader: %v", err)
+	}
+	defer s.Destroy()
+
+	// Wait for the start signal from runsc.
+	s.WaitForStartSignal()
+
+	// Run the application and wait for it to finish.
+	if err := s.Run(); err != nil {
+		Fatalf("error running sandbox: %v", err)
+	}
+
+	ws := s.WaitExit()
+	log.Infof("application exiting with %+v", ws)
+	*waitStatus = syscall.WaitStatus(ws.Status())
+	return subcommands.ExitSuccess
+}
+
+// setCapsAndCallSelf sets capabilities to the current thread and then execve's
+// itself again with the same arguments except '--apply-caps' to restart the
+// whole process with the desired capabilities.
+//
+// On success the process image is replaced and this function does not return.
+// Every failure, including a failed execve, terminates the process via
+// Fatalf. The new process is started with an empty environment.
+func setCapsAndCallSelf(conf *boot.Config, spec *specs.Spec) {
+	// Keep thread locked while capabilities are changed.
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	if err := boot.ApplyCaps(conf, spec.Process.Capabilities); err != nil {
+		Fatalf("ApplyCaps, err: %v", err)
+	}
+	binPath, err := specutils.BinPath()
+	if err != nil {
+		Fatalf("%v", err)
+	}
+
+	// Remove --apply-caps arg to call myself. Only the flag itself is
+	// dropped ("-apply-caps" or "--apply-caps", with or without "=value");
+	// other arguments that merely contain the text "apply-caps", such as a
+	// path, are kept.
+	var args []string
+	for _, arg := range os.Args {
+		name := strings.TrimPrefix(strings.TrimPrefix(arg, "-"), "-")
+		isFlag := name != arg
+		if isFlag && (name == "apply-caps" || strings.HasPrefix(name, "apply-caps=")) {
+			continue
+		}
+		args = append(args, arg)
+	}
+
+	log.Infof("Execve 'boot' again, bye!")
+	log.Infof("%s %v", binPath, args)
+	// Exec only returns if it failed; report why instead of dropping the
+	// error.
+	if err := syscall.Exec(binPath, args, []string{}); err != nil {
+		Fatalf("error executing %s: %v", binPath, err)
+	}
+}
diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go
new file mode 100644
index 000000000..d4b834213
--- /dev/null
+++ b/runsc/cmd/cmd.go
@@ -0,0 +1,77 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package cmd holds implementations of the runsc commands.
+package cmd
+
+import (
+	"fmt"
+	"os"
+	"strconv"
+
+	"flag"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+)
+
+// Fatalf logs to stderr and exits with a failure status code.
+//
+// The formatted message is also logged as a warning prefixed with
+// "FATAL ERROR: ". The process then exits with status 128, so Fatalf never
+// returns and deferred functions in callers do not run.
+func Fatalf(s string, args ...interface{}) {
+	// If runsc is being invoked by docker or cri-o, then we might not have
+	// access to stderr, so we log a serious-looking warning in addition to
+	// writing to stderr.
+	log.Warningf("FATAL ERROR: "+s, args...)
+	fmt.Fprintf(os.Stderr, s+"\n", args...)
+	// Return an error that is unlikely to be used by the application.
+	os.Exit(128)
+}
+
+// commandLineFlags returns every top-level command line flag that has been
+// set, each rendered as "--name=value".
+func commandLineFlags() []string {
+	var flags []string
+	collect := func(f *flag.Flag) {
+		flags = append(flags, "--"+f.Name+"="+f.Value.String())
+	}
+	// Visit only calls collect for flags that were explicitly set.
+	flag.CommandLine.Visit(collect)
+	return flags
+}
+
+// intFlags can be used with int flags that appear multiple times. Each
+// occurrence of the flag appends one value, in command line order.
+type intFlags []int
+
+// String implements flag.Value. It renders the collected values using the
+// default formatting of a slice.
+func (i *intFlags) String() string {
+	return fmt.Sprint(*i)
+}
+
+// Get implements flag.Value. It returns the receiver itself (a *intFlags),
+// not a copy of the values.
+func (i *intFlags) Get() interface{} {
+	return i
+}
+
+// GetArray returns array of FDs. The returned slice is the underlying slice,
+// not a copy.
+func (i *intFlags) GetArray() []int {
+	return *i
+}
+
+// Set implements flag.Value. It parses s as a decimal integer and appends it
+// to the list. An error is returned if s is not an integer or is negative;
+// zero is accepted.
+func (i *intFlags) Set(s string) error {
+	fd, err := strconv.Atoi(s)
+	if err != nil {
+		return fmt.Errorf("invalid flag value: %v", err)
+	}
+	if fd < 0 {
+		return fmt.Errorf("flag value must be greater than or equal to 0: %d", fd)
+	}
+	*i = append(*i, fd)
+	return nil
+}
diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go
new file mode 100644
index 000000000..83cb09eb0
--- /dev/null
+++ b/runsc/cmd/create.go
@@ -0,0 +1,93 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"flag"
+	"github.com/google/subcommands"
+	"gvisor.googlesource.com/gvisor/runsc/boot"
+	"gvisor.googlesource.com/gvisor/runsc/sandbox"
+	"gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+// Create implements subcommands.Command for the "create" command.
+//
+// All fields are populated from command line flags registered in SetFlags.
+type Create struct {
+	// bundleDir is the path to the bundle directory (defaults to the
+	// current working directory).
+	bundleDir string
+
+	// pidFile is the filename that the sandbox pid will be written to.
+	// This file should only be created once the sandbox process is ready
+	// to use (i.e. control server has started and is listening).
+	pidFile string
+
+	// consoleSocket is the path to an AF_UNIX socket which will receive a
+	// file descriptor referencing the master end of the console's
+	// pseudoterminal.  This is ignored unless spec.Process.Terminal is
+	// true.
+	consoleSocket string
+}
+
+// Name implements subcommands.Command.Name. It returns the subcommand name,
+// "create".
+func (*Create) Name() string {
+	return "create"
+}
+
+// Synopsis implements subcommands.Command.Synopsis. It returns a one-line
+// description of the command.
+func (*Create) Synopsis() string {
+	return "create a secure container"
+}
+
+// Usage implements subcommands.Command.Usage. It returns the usage text for
+// the command, terminated by a newline.
+func (*Create) Usage() string {
+	return `create [flags] <container id> - create a secure container
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. It registers the flags
+// that populate the fields of c on f; all of them default to the empty
+// string.
+func (c *Create) SetFlags(f *flag.FlagSet) {
+	f.StringVar(&c.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory")
+	f.StringVar(&c.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal")
+	f.StringVar(&c.pidFile, "pid-file", "", "filename that the sandbox pid will be written to")
+}
+
+// Execute implements subcommands.Command.Execute.
+//
+// It expects exactly one positional argument, the container id, and a
+// *boot.Config in args[0]. The spec is read from the bundle directory and a
+// sandbox is created from it. Failures terminate the process via Fatalf.
+func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	containerID := f.Arg(0)
+	conf := args[0].(*boot.Config)
+
+	// Fall back to the current working directory when --bundle is not set.
+	dir := c.bundleDir
+	if dir == "" {
+		dir = getwdOrDie()
+	}
+
+	spec, err := specutils.ReadSpec(dir)
+	if err != nil {
+		Fatalf("error reading spec: %v", err)
+	}
+	specutils.LogSpec(spec)
+
+	// Create the sandbox process, passing additional command line
+	// arguments to the sandbox process.
+	_, err = sandbox.Create(containerID, spec, conf, dir, c.consoleSocket, c.pidFile, commandLineFlags())
+	if err != nil {
+		Fatalf("error creating sandbox: %v", err)
+	}
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go
new file mode 100644
index 000000000..a497c034d
--- /dev/null
+++ b/runsc/cmd/delete.go
@@ -0,0 +1,74 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"flag"
+	"github.com/google/subcommands"
+	"gvisor.googlesource.com/gvisor/runsc/boot"
+	"gvisor.googlesource.com/gvisor/runsc/sandbox"
+)
+
+// Delete implements subcommands.Command for the "delete" command.
+type Delete struct {
+	// force indicates that the sandbox should be terminated if running.
+	// Without it, deleting a running sandbox is a fatal error.
+	force bool
+}
+
+// Name implements subcommands.Command.Name. It returns the subcommand name,
+// "delete".
+func (*Delete) Name() string {
+	return "delete"
+}
+
+// Synopsis implements subcommands.Command.Synopsis. It returns a one-line
+// description of the command.
+func (*Delete) Synopsis() string {
+	return "delete resources held by a container"
+}
+
+// Usage implements subcommands.Command.Usage. It returns the usage line for
+// the command; one or more container ids may be given.
+func (*Delete) Usage() string {
+	return `delete [flags] <container ids>`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. It registers the --force
+// flag, which defaults to false.
+func (d *Delete) SetFlags(f *flag.FlagSet) {
+	f.BoolVar(&d.force, "force", false, "terminate sandbox if running")
+}
+
+// Execute implements subcommands.Command.Execute.
+//
+// Every positional argument is treated as a sandbox id; each sandbox is
+// loaded from conf.RootDir and destroyed in turn. args[0] must be a
+// *boot.Config. The first failure terminates the process via Fatalf, so
+// later ids are not processed.
+func (d *Delete) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	ids := f.Args()
+	if len(ids) == 0 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	conf := args[0].(*boot.Config)
+
+	for _, id := range ids {
+		sb, err := sandbox.Load(conf.RootDir, id)
+		if err != nil {
+			Fatalf("error loading sandbox %q: %v", id, err)
+		}
+		// A running sandbox is only destroyed when --force was given.
+		if sb.Status == sandbox.Running && !d.force {
+			Fatalf("cannot stop running sandbox without --force flag")
+		}
+		if err := sb.Destroy(); err != nil {
+			Fatalf("error destroying sandbox: %v", err)
+		}
+	}
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go
new file mode 100644
index 000000000..afd42c2f2
--- /dev/null
+++ b/runsc/cmd/events.go
@@ -0,0 +1,111 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"encoding/json"
+	"os"
+	"time"
+
+	"context"
+	"flag"
+	"github.com/google/subcommands"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/runsc/boot"
+	"gvisor.googlesource.com/gvisor/runsc/sandbox"
+)
+
+// Events implements subcommands.Command for the "events" command.
+type Events struct {
+	// The interval between stats reporting, in seconds. Set by the
+	// --interval flag; defaults to 5.
+	intervalSec int
+	// If true, events will print a single group of stats and exit. Set by
+	// the --stats flag.
+	stats bool
+}
+
+// Name implements subcommands.Command.Name. It returns the subcommand name,
+// "events".
+func (*Events) Name() string {
+	return "events"
+}
+
+// Synopsis implements subcommands.Command.Synopsis. It returns a one-line
+// description of the command.
+func (*Events) Synopsis() string {
+	return "display container events such as OOM notifications, cpu, memory, and IO usage statistics"
+}
+
+// Usage implements subcommands.Command.Usage. It returns the multi-line usage
+// text for the command, ending with an "OPTIONS:" header.
+func (*Events) Usage() string {
+	return `<container-id>
+
+Where "<container-id>" is the name for the instance of the container.
+
+The events command displays information about the container. By default the
+information is displayed once every 5 seconds.
+
+OPTIONS:
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. It registers --interval
+// (default 5) and --stats (default false) on f.
+func (evs *Events) SetFlags(f *flag.FlagSet) {
+	f.IntVar(&evs.intervalSec, "interval", 5, "set the stats collection interval, in seconds")
+	f.BoolVar(&evs.stats, "stats", false, "display the container's stats then exit")
+}
+
+// Execute implements subcommands.Command.Execute.
+//
+// It loads the sandbox named by the single positional argument from
+// conf.RootDir and writes its events to stdout as JSON. With --stats a single
+// event is fetched and the command exits, returning ExitFailure if the event
+// could not be fetched or encoded. Otherwise an event is written every
+// intervalSec seconds and errors are only logged. args[0] must be a
+// *boot.Config.
+func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	id := f.Arg(0)
+	conf := args[0].(*boot.Config)
+
+	s, err := sandbox.Load(conf.RootDir, id)
+	if err != nil {
+		Fatalf("error loading sandbox: %v", err)
+	}
+
+	// Repeatedly get stats from the container.
+	for {
+		// failed records whether this iteration hit an error. It is tracked
+		// separately from err so that a successful Marshal cannot hide a
+		// failed Event call.
+		failed := false
+
+		// Get the event and print it as JSON. Nothing is printed when the
+		// event could not be retrieved.
+		ev, err := s.Event()
+		if err != nil {
+			log.Warningf("error getting events for sandbox: %v", err)
+			failed = true
+		} else if b, err := json.Marshal(ev); err != nil {
+			log.Warningf("error while marshalling event %v: %v", ev, err)
+			failed = true
+		} else {
+			os.Stdout.Write(b)
+		}
+
+		// If we're only running once, break. If we're only running
+		// once and there was an error, the command failed.
+		if evs.stats {
+			if failed {
+				return subcommands.ExitFailure
+			}
+			break
+		}
+
+		time.Sleep(time.Duration(evs.intervalSec) * time.Second)
+	}
+
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
new file mode 100644
index 000000000..8379f552d
--- /dev/null
+++ b/runsc/cmd/exec.go
@@ -0,0 +1,375 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"os/exec"
+	"strconv"
+	"strings"
+	"syscall"
+	"time"
+
+	"context"
+	"flag"
+	"github.com/google/subcommands"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/control"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.googlesource.com/gvisor/pkg/urpc"
+	"gvisor.googlesource.com/gvisor/runsc/boot"
+	"gvisor.googlesource.com/gvisor/runsc/sandbox"
+	"gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+// Exec implements subcommands.Command for the "exec" command.
+//
+// Every field is bound to a command-line flag in SetFlags: cwd (-cwd), env
+// (-env, repeatable), user (-user), extraKGIDs (-additional-gids, repeatable),
+// caps (-cap, repeatable), detach (-detach), processPath (-process) and
+// pidFile (-pid-file).
+type Exec struct {
+	cwd string
+	env stringSlice
+	// user contains the UID and GID with which to run the new process.
+	user        user
+	extraKGIDs  stringSlice
+	caps        stringSlice
+	detach      bool
+	processPath string
+	pidFile     string
+}
+
+// Name implements subcommands.Command.Name. It returns "exec".
+func (*Exec) Name() string {
+	return "exec"
+}
+
+// Synopsis implements subcommands.Command.Synopsis. It returns a one-line
+// description of the exec command.
+func (*Exec) Synopsis() string {
+	return "execute new process inside the container"
+}
+
+// Usage implements subcommands.Command.Usage. It returns the help text for the
+// exec command, describing both invocation forms (a command given on the
+// command line, or a process.json passed via -process) and ending with an
+// "OPTIONS:" heading.
+func (*Exec) Usage() string {
+	return `exec [command options] <container-id> <command> [command options] || --process process.json <container-id>
+
+
+Where "<container-id>" is the name for the instance of the container and
+"<command>" is the command to be executed in the container.
+"<command>" can't be empty unless a "-process" flag is provided.
+
+EXAMPLE:
+If the container is configured to run /bin/ps the following will
+output a list of processes running in the container:
+
+       # runsc exec <container-id> ps
+
+OPTIONS:
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. It binds each field of ex
+// to its flag. The -env, -additional-gids and -cap flags are registered with
+// f.Var on stringSlice values, so they may be given several times; -user is
+// parsed by the user type.
+func (ex *Exec) SetFlags(f *flag.FlagSet) {
+	f.StringVar(&ex.cwd, "cwd", "", "current working directory")
+	f.Var(&ex.env, "env", "set environment variables (e.g. '-env PATH=/bin -env TERM=xterm')")
+	f.Var(&ex.user, "user", "UID (format: <uid>[:<gid>])")
+	f.Var(&ex.extraKGIDs, "additional-gids", "additional gids")
+	f.Var(&ex.caps, "cap", "add a capability to the bounding set for the process")
+	f.BoolVar(&ex.detach, "detach", false, "detach from the container's process")
+	f.StringVar(&ex.processPath, "process", "", "path to the process.json")
+	f.StringVar(&ex.pidFile, "pid-file", "", "filename that the sandbox pid will be written to")
+}
+
+// Execute implements subcommands.Command.Execute. It starts a process in an
+// already created sandbox.
+//
+// args[0] must be a *boot.Config and args[1] a *syscall.WaitStatus. In the
+// normal (non-detached) mode the wait status returned by the sandbox for the
+// executed process is stored through args[1]. In detach mode this process
+// re-executes itself without the detach flag, waits up to 10 seconds for the
+// child to create the PID file, stores 0 through args[1] and returns.
+// All failures are reported through Fatalf.
+func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	e, id, err := ex.parseArgs(f)
+	if err != nil {
+		Fatalf("error parsing process spec: %v", err)
+	}
+	// Argv[0] is used below to locate the executable. A process file with an
+	// empty argument list would otherwise cause an index-out-of-range panic.
+	if len(e.Argv) == 0 {
+		Fatalf("error parsing process spec: no command to execute was given")
+	}
+	e.Detach = ex.detach
+	conf := args[0].(*boot.Config)
+	waitStatus := args[1].(*syscall.WaitStatus)
+
+	s, err := sandbox.Load(conf.RootDir, id)
+	if err != nil {
+		Fatalf("error loading sandbox: %v", err)
+	}
+
+	if e.WorkingDirectory == "" {
+		e.WorkingDirectory = s.Spec.Process.Cwd
+	}
+
+	if e.Envv == nil {
+		e.Envv, err = resolveEnvs(s.Spec.Process.Env, ex.env)
+		if err != nil {
+			Fatalf("error getting environment variables: %v", err)
+		}
+	}
+
+	// containerd expects an actual process to represent the container being
+	// executed. If detach was specified, starts a child in non-detach mode,
+	// write the child's PID to the pid file. So when the container returns, the
+	// child process will also return and signal containerd.
+	if e.Detach {
+		binPath, err := specutils.BinPath()
+		if err != nil {
+			Fatalf("error getting bin path: %v", err)
+		}
+		// Rebuild the command line without the detach flag. Only arguments
+		// that are exactly the flag ("-detach" or "--detach", optionally
+		// followed by "=value") are dropped, instead of every argument that
+		// merely contains the word "detach". A command argument spelled
+		// exactly like the flag is still dropped.
+		var childArgs []string
+		for _, a := range os.Args[1:] {
+			if name := strings.SplitN(a, "=", 2)[0]; name == "-detach" || name == "--detach" {
+				continue
+			}
+			childArgs = append(childArgs, a)
+		}
+		cmd := exec.Command(binPath, childArgs...)
+		cmd.Stdin = os.Stdin
+		cmd.Stdout = os.Stdout
+		cmd.Stderr = os.Stderr
+		if err := cmd.Start(); err != nil {
+			Fatalf("failure to start child exec process, err: %v", err)
+		}
+
+		log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, binPath, childArgs)
+
+		// Wait for PID file to ensure that child process has started. Otherwise,
+		// '--process' file is deleted as soon as this process returns and the child
+		// may fail to read it.
+		sleepTime := 10 * time.Millisecond
+		for start := time.Now(); time.Now().Sub(start) < 10*time.Second; {
+			_, err := os.Stat(ex.pidFile)
+			if err == nil {
+				break
+			}
+			if !os.IsNotExist(err) {
+				Fatalf("unexpected error waiting for PID file, err: %v", err)
+			}
+
+			log.Infof("Waiting for PID file to be created...")
+			time.Sleep(sleepTime)
+			// Exponential backoff: double the delay, capped at one second.
+			sleepTime *= 2
+			if sleepTime > 1*time.Second {
+				sleepTime = 1 * time.Second
+			}
+		}
+		*waitStatus = 0
+		return subcommands.ExitSuccess
+	}
+
+	if ex.pidFile != "" {
+		if err := ioutil.WriteFile(ex.pidFile, []byte(strconv.Itoa(os.Getpid())), 0644); err != nil {
+			Fatalf("error writing pid file: %v", err)
+		}
+	}
+
+	// Get the executable path, which is a bit tricky because we have to
+	// inspect the environment PATH which is relative to the root path.
+	// If the user is overriding environment variables, PATH may have been
+	// overwritten.
+	rootPath := s.Spec.Root.Path
+	e.Filename, err = specutils.GetExecutablePath(e.Argv[0], rootPath, e.Envv)
+	if err != nil {
+		Fatalf("error getting executable path: %v", err)
+	}
+
+	ws, err := s.Execute(e)
+	if err != nil {
+		Fatalf("error executing process in sandbox: %v", err)
+	}
+	*waitStatus = ws
+	return subcommands.ExitSuccess
+}
+
+// parseArgs parses exec information from the command line or a JSON file
+// depending on whether the --process flag was used. Returns an ExecArgs and
+// the ID of the sandbox to be used.
+func (ex *Exec) parseArgs(f *flag.FlagSet) (*control.ExecArgs, string, error) {
+	if ex.processPath == "" {
+		// Requires at least a container ID and command.
+		if f.NArg() < 2 {
+			f.Usage()
+			return nil, "", fmt.Errorf("both a container-id and command are required")
+		}
+		e, err := ex.argsFromCLI(f.Args()[1:])
+		return e, f.Arg(0), err
+	}
+	// Requires only the container ID.
+	if f.NArg() != 1 {
+		f.Usage()
+		return nil, "", fmt.Errorf("a container-id is required")
+	}
+	e, err := ex.argsFromProcessFile()
+	return e, f.Arg(0), err
+}
+
+// argsFromCLI builds the ExecArgs for a command given on the command line,
+// combining argv with the values of the -cwd, -user, -additional-gids and -cap
+// flags. Envv is left unset, and the process is wired to this process's
+// stdin, stdout and stderr.
+//
+// It returns an error, rather than terminating the program, when an
+// additional GID is not a valid unsigned 32-bit number or when the
+// capabilities cannot be converted.
+func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) {
+	extraKGIDs := make([]auth.KGID, 0, len(ex.extraKGIDs))
+	for _, s := range ex.extraKGIDs {
+		// ParseUint rejects negative and out-of-range values that a signed
+		// parse followed by a conversion would silently wrap.
+		// NOTE(review): assumes auth.KGID is a 32-bit unsigned ID - confirm.
+		kgid, err := strconv.ParseUint(s, 10, 32)
+		if err != nil {
+			return nil, fmt.Errorf("error parsing GID: %s, %v", s, err)
+		}
+		extraKGIDs = append(extraKGIDs, auth.KGID(kgid))
+	}
+
+	caps, err := capabilities(ex.caps)
+	if err != nil {
+		return nil, fmt.Errorf("capabilities error: %v", err)
+	}
+
+	return &control.ExecArgs{
+		Argv:             argv,
+		WorkingDirectory: ex.cwd,
+		FilePayload:      urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}},
+		KUID:             ex.user.kuid,
+		KGID:             ex.user.kgid,
+		ExtraKGIDs:       extraKGIDs,
+		Capabilities:     caps,
+	}, nil
+}
+
+func (ex *Exec) argsFromProcessFile() (*control.ExecArgs, error) {
+	f, err := os.Open(ex.processPath)
+	if err != nil {
+		return nil, fmt.Errorf("error opening process file: %s, %v", ex.processPath, err)
+	}
+	defer f.Close()
+	var p specs.Process
+	if err := json.NewDecoder(f).Decode(&p); err != nil {
+		return nil, fmt.Errorf("error parsing process file: %s, %v", ex.processPath, err)
+	}
+	return argsFromProcess(&p)
+}
+
+// argsFromProcess converts an OCI Process description into ExecArgs. It does
+// no file IO itself: capabilities, user and group IDs, arguments, environment
+// and working directory are copied or converted from p, and the process is
+// wired to this process's stdin, stdout and stderr.
+func argsFromProcess(p *specs.Process) (*control.ExecArgs, error) {
+	// Translate the spec's additional GIDs into KGIDs, keeping their order.
+	gids := p.User.AdditionalGids
+	extra := make([]auth.KGID, len(gids))
+	for i := range gids {
+		extra[i] = auth.KGID(gids[i])
+	}
+
+	// Build the capability sets; a conversion failure aborts the whole call.
+	taskCaps, err := specutils.Capabilities(p.Capabilities)
+	if err != nil {
+		return nil, fmt.Errorf("error creating capabilities: %v", err)
+	}
+
+	stdio := []*os.File{os.Stdin, os.Stdout, os.Stderr}
+	execArgs := &control.ExecArgs{
+		Argv:             p.Args,
+		Envv:             p.Env,
+		WorkingDirectory: p.Cwd,
+		FilePayload:      urpc.FilePayload{Files: stdio},
+		KUID:             auth.KUID(p.User.UID),
+		KGID:             auth.KGID(p.User.GID),
+		ExtraKGIDs:       extra,
+		Capabilities:     taskCaps,
+	}
+	return execArgs, nil
+}
+
+// resolveEnvs transforms lists of environment variables into a single list of
+// environment variables. If a variable is defined multiple times, the last
+// value is used.
+//
+// The result is deterministic: each variable keeps the position at which its
+// name was first seen, so repeated calls with the same input yield the same
+// ordering. An entry without an "=" separator causes an error. The returned
+// slice is never nil when err is nil.
+func resolveEnvs(envs ...[]string) ([]string, error) {
+	total := 0
+	for _, env := range envs {
+		total += len(env)
+	}
+
+	// position maps a variable name to its index in resolved, which lets a
+	// later definition overwrite an earlier one in place.
+	position := make(map[string]int, total)
+	resolved := make([]string, 0, total)
+	for _, env := range envs {
+		for _, str := range env {
+			parts := strings.SplitN(str, "=", 2)
+			if len(parts) != 2 {
+				return nil, fmt.Errorf("invalid variable: %s", str)
+			}
+			if i, ok := position[parts[0]]; ok {
+				resolved[i] = str
+				continue
+			}
+			position[parts[0]] = len(resolved)
+			resolved = append(resolved, str)
+		}
+	}
+	return resolved, nil
+}
+
+// capabilities takes a list of capabilities as strings and returns an
+// auth.TaskCapabilities struct with those capabilities in every capability set.
+// This mimics runc's behavior.
+func capabilities(cs []string) (*auth.TaskCapabilities, error) {
+	var specCaps specs.LinuxCapabilities
+	for _, cap := range cs {
+		specCaps.Ambient = append(specCaps.Ambient, cap)
+		specCaps.Bounding = append(specCaps.Bounding, cap)
+		specCaps.Effective = append(specCaps.Effective, cap)
+		specCaps.Inheritable = append(specCaps.Inheritable, cap)
+		specCaps.Permitted = append(specCaps.Permitted, cap)
+	}
+	return specutils.Capabilities(&specCaps)
+}
+
+// stringSlice is a flag value that accumulates one string per occurrence of
+// the flag. For example, with a flag called "x", the command line
+// "runsc exec -x foo -x bar" produces the stringSlice {"foo", "bar"}.
+type stringSlice []string
+
+// String implements flag.Value.String. It formats the collected values the
+// same way fmt prints a []string.
+func (ss *stringSlice) String() string {
+	return fmt.Sprint([]string(*ss))
+}
+
+// Get implements flag.Getter.Get. It returns the receiver itself.
+func (ss *stringSlice) Get() interface{} {
+	return ss
+}
+
+// Set implements flag.Value.Set. It appends s to the slice and never fails.
+func (ss *stringSlice) Set(s string) error {
+	grown := append(*ss, s)
+	*ss = grown
+	return nil
+}
+
+// user allows -user to convey a UID and, optionally, a GID separated by a
+// colon.
+type user struct {
+	kuid auth.KUID
+	kgid auth.KGID
+}
+
+// String implements flag.Value.String. It prints both fields with their
+// names.
+func (u *user) String() string {
+	return fmt.Sprintf("%+v", *u)
+}
+
+// Get implements flag.Getter.Get. It returns the receiver itself.
+func (u *user) Get() interface{} {
+	return u
+}
+
+// Set implements flag.Value.Set. It parses s in the form "<uid>[:<gid>]".
+//
+// Both IDs must be unsigned decimal numbers that fit in 32 bits; negative or
+// larger values are rejected instead of being silently wrapped by the
+// conversion. When no GID is given the current kgid is left untouched. The
+// receiver is only modified once the whole string has parsed successfully.
+// NOTE(review): assumes auth.KUID and auth.KGID are 32-bit unsigned IDs -
+// confirm.
+func (u *user) Set(s string) error {
+	parts := strings.SplitN(s, ":", 2)
+	kuid, err := strconv.ParseUint(parts[0], 10, 32)
+	if err != nil {
+		return fmt.Errorf("couldn't parse UID: %s", parts[0])
+	}
+
+	var kgid uint64
+	hasGID := len(parts) > 1
+	if hasGID {
+		if kgid, err = strconv.ParseUint(parts[1], 10, 32); err != nil {
+			return fmt.Errorf("couldn't parse GID: %s", parts[1])
+		}
+	}
+
+	u.kuid = auth.KUID(kuid)
+	if hasGID {
+		u.kgid = auth.KGID(kgid)
+	}
+	return nil
+}
diff --git a/runsc/cmd/exec_test.go b/runsc/cmd/exec_test.go
new file mode 100644
index 000000000..623461e78
--- /dev/null
+++ b/runsc/cmd/exec_test.go
@@ -0,0 +1,154 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"os"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/go-cmp/cmp/cmpopts"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/control"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.googlesource.com/gvisor/pkg/urpc"
+)
+
+// TestUser checks that user.Set accepts "<uid>" and "<uid>:<gid>" and rejects
+// empty, non-numeric and malformed inputs.
+func TestUser(t *testing.T) {
+	testCases := []struct {
+		input   string
+		want    user
+		wantErr bool
+	}{
+		{input: "0", want: user{kuid: 0, kgid: 0}},
+		{input: "7", want: user{kuid: 7, kgid: 0}},
+		{input: "49:343", want: user{kuid: 49, kgid: 343}},
+		{input: "0:2401", want: user{kuid: 0, kgid: 2401}},
+		{input: "", wantErr: true},
+		{input: "foo", wantErr: true},
+		{input: ":123", wantErr: true},
+		{input: "1:2:3", wantErr: true},
+	}
+
+	for _, tc := range testCases {
+		var got user
+		err := got.Set(tc.input)
+		switch {
+		case tc.wantErr && err != nil:
+			// We got an error and wanted one.
+		case tc.wantErr:
+			t.Errorf("user.Set(%s): got no error, but wanted one", tc.input)
+		case err != nil:
+			t.Errorf("user.Set(%s): got error %v, but wanted none", tc.input, err)
+		case got != tc.want:
+			t.Errorf("user.Set(%s): got %+v, but wanted %+v", tc.input, got, tc.want)
+		}
+	}
+}
+
+// TestCLIArgs checks that argsFromCLI turns the flag values held by an Exec
+// plus an argv into the expected control.ExecArgs.
+func TestCLIArgs(t *testing.T) {
+	// Values shared by the expectations below.
+	stdio := urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}}
+	dacOverride := auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE)
+
+	testCases := []struct {
+		ex       Exec
+		argv     []string
+		expected control.ExecArgs
+	}{
+		{
+			ex: Exec{
+				cwd:         "/foo/bar",
+				user:        user{kuid: 0, kgid: 0},
+				extraKGIDs:  []string{"1", "2", "3"},
+				caps:        []string{"CAP_DAC_OVERRIDE"},
+				processPath: "",
+			},
+			argv: []string{"ls", "/"},
+			expected: control.ExecArgs{
+				Argv:             []string{"ls", "/"},
+				WorkingDirectory: "/foo/bar",
+				FilePayload:      stdio,
+				KUID:             0,
+				KGID:             0,
+				ExtraKGIDs:       []auth.KGID{1, 2, 3},
+				Capabilities: &auth.TaskCapabilities{
+					BoundingCaps:    dacOverride,
+					EffectiveCaps:   dacOverride,
+					InheritableCaps: dacOverride,
+					PermittedCaps:   dacOverride,
+				},
+			},
+		},
+	}
+
+	// *os.File has unexported fields that cmp cannot compare.
+	ignoreFiles := cmpopts.IgnoreUnexported(os.File{})
+	for _, tc := range testCases {
+		got, err := tc.ex.argsFromCLI(tc.argv)
+		if err != nil {
+			t.Errorf("argsFromCLI(%+v): got error: %+v", tc.ex, err)
+			continue
+		}
+		if !cmp.Equal(*got, tc.expected, ignoreFiles) {
+			t.Errorf("argsFromCLI(%+v): got %+v, but expected %+v", tc.ex, *got, tc.expected)
+		}
+	}
+}
+
+// TestJSONArgs checks that argsFromProcess converts a specs.Process into the
+// expected control.ExecArgs.
+func TestJSONArgs(t *testing.T) {
+	// Values shared by the expectations below.
+	stdio := urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}}
+	dacOverride := auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE)
+
+	testCases := []struct {
+		// ex is provided to make sure it is overridden by p.
+		ex       Exec
+		p        specs.Process
+		expected control.ExecArgs
+	}{
+		{
+			ex: Exec{
+				cwd:         "/baz/quux",
+				user:        user{kuid: 1, kgid: 1},
+				extraKGIDs:  []string{"4", "5", "6"},
+				caps:        []string{"CAP_SETGID"},
+				processPath: "/bin/foo",
+			},
+			p: specs.Process{
+				User: specs.User{UID: 0, GID: 0, AdditionalGids: []uint32{1, 2, 3}},
+				Args: []string{"ls", "/"},
+				Cwd:  "/foo/bar",
+				Capabilities: &specs.LinuxCapabilities{
+					Bounding:    []string{"CAP_DAC_OVERRIDE"},
+					Effective:   []string{"CAP_DAC_OVERRIDE"},
+					Inheritable: []string{"CAP_DAC_OVERRIDE"},
+					Permitted:   []string{"CAP_DAC_OVERRIDE"},
+				},
+			},
+			expected: control.ExecArgs{
+				Argv:             []string{"ls", "/"},
+				WorkingDirectory: "/foo/bar",
+				FilePayload:      stdio,
+				KUID:             0,
+				KGID:             0,
+				ExtraKGIDs:       []auth.KGID{1, 2, 3},
+				Capabilities: &auth.TaskCapabilities{
+					BoundingCaps:    dacOverride,
+					EffectiveCaps:   dacOverride,
+					InheritableCaps: dacOverride,
+					PermittedCaps:   dacOverride,
+				},
+			},
+		},
+	}
+
+	// *os.File has unexported fields that cmp cannot compare.
+	ignoreFiles := cmpopts.IgnoreUnexported(os.File{})
+	for _, tc := range testCases {
+		got, err := argsFromProcess(&tc.p)
+		if err != nil {
+			t.Errorf("argsFromProcess(%+v): got error: %+v", tc.p, err)
+			continue
+		}
+		if !cmp.Equal(*got, tc.expected, ignoreFiles) {
+			t.Errorf("argsFromProcess(%+v): got %+v, but expected %+v", tc.p, *got, tc.expected)
+		}
+	}
+}
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
new file mode 100644
index 000000000..844e16dbf
--- /dev/null
+++ b/runsc/cmd/gofer.go
@@ -0,0 +1,134 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"sync"
+
+	"context"
+	"flag"
+	"github.com/google/subcommands"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/p9"
+	"gvisor.googlesource.com/gvisor/pkg/unet"
+	"gvisor.googlesource.com/gvisor/runsc/fsgofer"
+	"gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+// Gofer implements subcommands.Command for the "gofer" command, which starts a
+// filesystem gofer. This command should not be called directly.
+//
+// bundleDir is set by the -bundle flag and ioFDs by the -io-fds flag; Execute
+// serves one attach point per entry of ioFDs.
+type Gofer struct {
+	bundleDir string
+	ioFDs     intFlags
+}
+
+// Name implements subcommands.Command.Name. It returns "gofer".
+func (*Gofer) Name() string {
+	return "gofer"
+}
+
+// Synopsis implements subcommands.Command.Synopsis. It returns a one-line
+// description of the gofer command.
+func (*Gofer) Synopsis() string {
+	return "launch a gofer process that serves files over 9P protocol (internal use only)"
+}
+
+// Usage implements subcommands.Command.Usage. It returns the one-line usage
+// string of the gofer command.
+func (*Gofer) Usage() string {
+	return `gofer [flags]`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. It registers the -bundle
+// and -io-fds flags.
+//
+// NOTE(review): the -bundle help text says it defaults to the current
+// directory, but Execute returns a usage error when the flag is left empty -
+// confirm which behavior is intended.
+func (g *Gofer) SetFlags(f *flag.FlagSet) {
+	f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory")
+	f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec")
+}
+
+// Execute implements subcommands.Command.Execute. It reads the OCI spec from
+// the bundle directory, creates one attach point for the root filesystem plus
+// one for every 9P mount in the spec, and serves them on the FDs given by
+// -io-fds (root first, then the mounts in spec order).
+//
+// It returns a usage error when -bundle is empty or no FD was passed, and
+// calls Fatalf when the number of FDs does not match the number of attach
+// points. It returns only after runServers does.
+func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if g.bundleDir == "" || len(g.ioFDs) < 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	spec, err := specutils.ReadSpec(g.bundleDir)
+	if err != nil {
+		Fatalf("error reading spec: %v", err)
+	}
+	specutils.LogSpec(spec)
+
+	// Start with root mount, then add any other additional mount as needed.
+	ats := make([]p9.Attacher, 0, len(spec.Mounts)+1)
+	p := absPath(g.bundleDir, spec.Root.Path)
+	ats = append(ats, fsgofer.NewAttachPoint(p, fsgofer.Config{
+		ROMount: spec.Root.Readonly,
+		// Docker uses overlay2 by default for the root mount, and overlay2 does a copy-up when
+		// each file is opened as writable. Thus, we open files lazily to avoid copy-up.
+		LazyOpenForWrite: true,
+	}))
+	log.Infof("Serving %q mapped to %q on FD %d", "/", p, g.ioFDs[0])
+
+	mountIdx := 1 // first one is the root
+	for _, m := range spec.Mounts {
+		if specutils.Is9PMount(m) {
+			p = absPath(g.bundleDir, m.Source)
+			ats = append(ats, fsgofer.NewAttachPoint(p, fsgofer.Config{
+				ROMount:          isReadonlyMount(m.Options),
+				LazyOpenForWrite: false,
+			}))
+
+			// Every 9P mount needs its own FD; report which mount index
+			// ran out and how many FDs were actually supplied.
+			if mountIdx >= len(g.ioFDs) {
+				Fatalf("No FD found for mount. Did you forget --io-fds? mount: %d, FDs: %d, %v", mountIdx, len(g.ioFDs), m)
+			}
+			log.Infof("Serving %q mapped to %q on FD %d", m.Destination, p, g.ioFDs[mountIdx])
+			mountIdx++
+		}
+	}
+	if mountIdx != len(g.ioFDs) {
+		Fatalf("Too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs))
+	}
+
+	runServers(ats, g.ioFDs)
+	return subcommands.ExitSuccess
+}
+
+// runServers starts one goroutine per entry of ioFDs, each serving the attach
+// point at the same index of ats over a socket created from that FD, and
+// blocks until all of them have returned. A failure to create the socket or
+// an error from the server is reported through Fatalf.
+func runServers(ats []p9.Attacher, ioFDs []int) {
+	var pending sync.WaitGroup
+	for idx := range ioFDs {
+		pending.Add(1)
+		go func(fd int, attacher p9.Attacher) {
+			defer pending.Done()
+
+			conn, err := unet.NewSocket(fd)
+			if err != nil {
+				Fatalf("err creating server on FD %d: %v", fd, err)
+			}
+			server := p9.NewServer(attacher)
+			if err := server.Handle(conn); err != nil {
+				Fatalf("P9 server returned error. Gofer is shutting down. FD: %d, err: %v", fd, err)
+			}
+		}(ioFDs[idx], ats[idx])
+	}
+
+	// Run the loops and wait for all to exit.
+	pending.Wait()
+	log.Infof("All 9P servers exited.")
+}
+
+// isReadonlyMount reports whether the mount options contain the exact option
+// "ro".
+func isReadonlyMount(opts []string) bool {
+	readonly := false
+	for idx := 0; idx < len(opts) && !readonly; idx++ {
+		readonly = opts[idx] == "ro"
+	}
+	return readonly
+}
diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go
new file mode 100644
index 000000000..f89e0077e
--- /dev/null
+++ b/runsc/cmd/kill.go
@@ -0,0 +1,142 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+	"syscall"
+
+	"context"
+	"flag"
+	"github.com/google/subcommands"
+	"golang.org/x/sys/unix"
+	"gvisor.googlesource.com/gvisor/runsc/boot"
+	"gvisor.googlesource.com/gvisor/runsc/sandbox"
+)
+
+// Kill implements subcommands.Command for the "kill" command. It carries no
+// state of its own.
+type Kill struct{}
+
+// Name implements subcommands.Command.Name. It returns "kill".
+func (*Kill) Name() string {
+	return "kill"
+}
+
+// Synopsis implements subcommands.Command.Synopsis. It returns a one-line
+// description of the kill command.
+func (*Kill) Synopsis() string {
+	return "sends a signal to the sandbox"
+}
+
+// Usage implements subcommands.Command.Usage. The signal argument is optional.
+func (*Kill) Usage() string {
+	return `kill <container id> [signal]`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. It registers the -all
+// flag, whose value is parsed into a local variable and then discarded.
+func (*Kill) SetFlags(f *flag.FlagSet) {
+	// TODO: Implement this flag.  It is defined here just to
+	// prevent runsc from crashing if it is passed.
+	var all bool
+	f.BoolVar(&all, "all", false, "send the specified signal to all processes inside the container")
+}
+
+// Execute implements subcommands.Command.Execute. It loads the sandbox named
+// by the first positional argument and sends it the signal given as the
+// optional second positional argument, defaulting to TERM when none is given.
+//
+// args[0] must be a *boot.Config. A usage error is returned unless one or two
+// positional arguments are present; an unknown signal or a failure to load or
+// signal the sandbox is reported through Fatalf.
+func (*Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() == 0 || f.NArg() > 2 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	id := f.Arg(0)
+	conf := args[0].(*boot.Config)
+
+	s, err := sandbox.Load(conf.RootDir, id)
+	if err != nil {
+		Fatalf("error loading sandbox: %v", err)
+	}
+
+	// The OCI command-line spec says that the signal should be specified
+	// via a flag, but runc (and things that call runc) pass it as an
+	// argument. It is the second positional argument, right after the
+	// container ID; f.Arg returns "" when it is absent.
+	signal := f.Arg(1)
+	if signal == "" {
+		signal = "TERM"
+	}
+
+	sig, err := parseSignal(signal)
+	if err != nil {
+		Fatalf("%v", err)
+	}
+	if err := s.Signal(sig); err != nil {
+		Fatalf("%v", err)
+	}
+	return subcommands.ExitSuccess
+}
+
+// parseSignal converts s into a signal. s may be a decimal signal number,
+// which must equal one of the values in signalMap, or a signal name in any
+// letter case with or without a leading "SIG". Unknown numbers and names
+// yield -1 and an error.
+func parseSignal(s string) (syscall.Signal, error) {
+	// Numeric form: accept the number only if it is a known signal.
+	if num, convErr := strconv.Atoi(s); convErr == nil {
+		candidate := syscall.Signal(num)
+		for _, known := range signalMap {
+			if known == candidate {
+				return candidate, nil
+			}
+		}
+		return -1, fmt.Errorf("unknown signal %q", s)
+	}
+
+	// Symbolic form: normalize to the upper-case name without "SIG".
+	name := strings.ToUpper(s)
+	name = strings.TrimPrefix(name, "SIG")
+	sig, found := signalMap[name]
+	if !found {
+		return -1, fmt.Errorf("unknown signal %q", s)
+	}
+	return sig, nil
+}
+
+// signalMap maps upper-case signal names, without the "SIG" prefix, to the
+// corresponding signal constants from golang.org/x/sys/unix. parseSignal uses
+// it both to look up names and to validate numeric signals.
+var signalMap = map[string]syscall.Signal{
+	"ABRT":   unix.SIGABRT,
+	"ALRM":   unix.SIGALRM,
+	"BUS":    unix.SIGBUS,
+	"CHLD":   unix.SIGCHLD,
+	"CLD":    unix.SIGCLD,
+	"CONT":   unix.SIGCONT,
+	"FPE":    unix.SIGFPE,
+	"HUP":    unix.SIGHUP,
+	"ILL":    unix.SIGILL,
+	"INT":    unix.SIGINT,
+	"IO":     unix.SIGIO,
+	"IOT":    unix.SIGIOT,
+	"KILL":   unix.SIGKILL,
+	"PIPE":   unix.SIGPIPE,
+	"POLL":   unix.SIGPOLL,
+	"PROF":   unix.SIGPROF,
+	"PWR":    unix.SIGPWR,
+	"QUIT":   unix.SIGQUIT,
+	"SEGV":   unix.SIGSEGV,
+	"STKFLT": unix.SIGSTKFLT,
+	"STOP":   unix.SIGSTOP,
+	"SYS":    unix.SIGSYS,
+	"TERM":   unix.SIGTERM,
+	"TRAP":   unix.SIGTRAP,
+	"TSTP":   unix.SIGTSTP,
+	"TTIN":   unix.SIGTTIN,
+	"TTOU":   unix.SIGTTOU,
+	"URG":    unix.SIGURG,
+	"USR1":   unix.SIGUSR1,
+	"USR2":   unix.SIGUSR2,
+	"VTALRM": unix.SIGVTALRM,
+	"WINCH":  unix.SIGWINCH,
+	"XCPU":   unix.SIGXCPU,
+	"XFSZ":   unix.SIGXFSZ,
+}
diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go
new file mode 100644
index 000000000..bf7cb41bb
--- /dev/null
+++ b/runsc/cmd/list.go
@@ -0,0 +1,117 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"text/tabwriter"
+	"time"
+
+	"context"
+	"flag"
+	"github.com/google/subcommands"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.googlesource.com/gvisor/runsc/boot"
+	"gvisor.googlesource.com/gvisor/runsc/sandbox"
+)
+
+// List implements subcommands.Command for the "list" command.
+//
+// quiet is set by the -quiet flag and format by the -format flag.
+type List struct {
+	quiet  bool
+	format string
+}
+
+// Name implements subcommands.Command.Name. It returns "list".
+func (*List) Name() string {
+	return "list"
+}
+
+// Synopsis implements subcommands.Command.Synopsis. It returns a one-line
+// description of the list command.
+func (*List) Synopsis() string {
+	return "list containers started by runsc with the given root"
+}
+
+// Usage implements subcommands.Command.Usage. The list command takes flags
+// only; Execute rejects any positional argument.
+func (*List) Usage() string {
+	return `list [flags]`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. It registers -quiet
+// (default false) and -format (default "text").
+func (l *List) SetFlags(f *flag.FlagSet) {
+	f.BoolVar(&l.quiet, "quiet", false, "only list container ids")
+	f.StringVar(&l.format, "format", "text", "output format: 'text' (default) or 'json'")
+}
+
+// Execute implements subcommands.Command.Execute. It lists the sandboxes found
+// under the configured root directory.
+//
+// args[0] must be a *boot.Config. With -quiet only the IDs are printed, one
+// per line. Otherwise every sandbox is loaded and printed either as an
+// aligned table (-format=text) or as a JSON array of OCI states
+// (-format=json). Positional arguments cause a usage error; any other failure
+// is reported through Fatalf.
+func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 0 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	rootDir := args[0].(*boot.Config).RootDir
+	ids, err := sandbox.List(rootDir)
+	if err != nil {
+		Fatalf("%v", err)
+	}
+
+	// Quiet mode needs nothing but the IDs.
+	if l.quiet {
+		for i := range ids {
+			fmt.Println(ids[i])
+		}
+		return subcommands.ExitSuccess
+	}
+
+	// Load every sandbox before printing anything.
+	var loaded []*sandbox.Sandbox
+	for _, id := range ids {
+		sb, loadErr := sandbox.Load(rootDir, id)
+		if loadErr != nil {
+			Fatalf("error loading sandbox %q: %v", id, loadErr)
+		}
+		loaded = append(loaded, sb)
+	}
+
+	switch l.format {
+	case "json":
+		// Print just the states.
+		var states []specs.State
+		for _, sb := range loaded {
+			states = append(states, sb.State())
+		}
+		encoder := json.NewEncoder(os.Stdout)
+		if encErr := encoder.Encode(states); encErr != nil {
+			Fatalf("error marshaling sandbox state: %v", encErr)
+		}
+	case "text":
+		// Print a nice table.
+		table := tabwriter.NewWriter(os.Stdout, 12, 1, 3, ' ', 0)
+		fmt.Fprint(table, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\tOWNER\n")
+		for _, sb := range loaded {
+			created := sb.CreatedAt.Format(time.RFC3339Nano)
+			fmt.Fprintf(table, "%s\t%d\t%s\t%s\t%s\t%s\n",
+				sb.ID, sb.Pid, sb.Status, sb.BundleDir, created, sb.Owner)
+		}
+		table.Flush()
+	default:
+		Fatalf("unknown list format %q", l.format)
+	}
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/path.go b/runsc/cmd/path.go
new file mode 100644
index 000000000..4bb1dbb4f
--- /dev/null
+++ b/runsc/cmd/path.go
@@ -0,0 +1,38 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"os"
+	"path/filepath"
+)
+
+// absPath returns rel unchanged when it is already absolute; otherwise it
+// returns rel joined onto base.
+func absPath(base, rel string) string {
+	if !filepath.IsAbs(rel) {
+		rel = filepath.Join(base, rel)
+	}
+	return rel
+}
+
+// getwdOrDie returns the current working directory, reporting a failure to
+// determine it through Fatalf.
+func getwdOrDie() string {
+	dir, getErr := os.Getwd()
+	if getErr == nil {
+		return dir
+	}
+	Fatalf("error getting current working directory: %v", getErr)
+	return dir
+}
diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go
new file mode 100644
index 000000000..a667ec04c
--- /dev/null
+++ b/runsc/cmd/ps.go
@@ -0,0 +1,86 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"fmt"
+
+	"flag"
+	"github.com/google/subcommands"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/control"
+	"gvisor.googlesource.com/gvisor/runsc/boot"
+	"gvisor.googlesource.com/gvisor/runsc/sandbox"
+)
+
+// PS implements subcommands.Command for the "ps" command.
+//
+// format is set by the -format flag and selects the output style used by
+// Execute.
+type PS struct {
+	format string
+}
+
+// Name implements subcommands.Command.Name. It returns "ps".
+func (*PS) Name() string {
+	return "ps"
+}
+
+// Synopsis implements subcommands.Command.Synopsis. It returns a one-line
+// description of the ps command.
+func (*PS) Synopsis() string {
+	return "ps displays the processes running inside a container"
+}
+
+// Usage implements subcommands.Command.Usage. It returns the argument
+// synopsis of the ps command.
+func (*PS) Usage() string {
+	return "<container-id> [ps options]"
+}
+
+// SetFlags implements subcommands.Command.SetFlags. It registers the -format
+// flag, which defaults to "table".
+func (ps *PS) SetFlags(f *flag.FlagSet) {
+	f.StringVar(&ps.format, "format", "table", "output format. Select one of: table or json (default: table)")
+}
+
+// Execute implements subcommands.Command.Execute. It loads the sandbox named
+// by the single positional argument, fetches its process list and prints it
+// to stdout as a table (-format=table) or as JSON (-format=json).
+//
+// args[0] must be a *boot.Config. A usage error is returned unless exactly
+// one positional argument is given; an unsupported format or a failure to
+// load the sandbox, list its processes or generate the JSON is reported
+// through Fatalf.
+func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	id := f.Arg(0)
+	conf := args[0].(*boot.Config)
+
+	s, err := sandbox.Load(conf.RootDir, id)
+	if err != nil {
+		Fatalf("error loading sandbox: %v", err)
+	}
+	pList, err := s.Processes()
+	if err != nil {
+		Fatalf("error getting processes for sandbox: %v", err)
+	}
+
+	switch ps.format {
+	case "table":
+		fmt.Println(control.ProcessListToTable(pList))
+	case "json":
+		o, err := control.PrintPIDsJSON(pList)
+		if err != nil {
+			Fatalf("error generating JSON: %v", err)
+		}
+		fmt.Println(o)
+	default:
+		Fatalf("Unsupported format: %s", ps.format)
+	}
+
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go
new file mode 100644
index 000000000..a61a6c73e
--- /dev/null
+++ b/runsc/cmd/run.go
@@ -0,0 +1,82 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"syscall"
+
+	"context"
+	"flag"
+	"github.com/google/subcommands"
+	"gvisor.googlesource.com/gvisor/runsc/boot"
+	"gvisor.googlesource.com/gvisor/runsc/sandbox"
+	"gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+// Run implements subcommands.Command for the "run" command.
+type Run struct {
+	// Run flags are a super-set of those for Create. Embedding Create gives
+	// Run the same flag-backed fields (Execute reads bundleDir, consoleSocket
+	// and pidFile through it).
+	Create
+}
+
+// Name implements subcommands.Command.Name. It returns the command name,
+// "run".
+func (*Run) Name() string {
+	return "run"
+}
+
+// Synopsis implements subcommands.Command.Synopsis. It returns a one-line
+// description of the command.
+func (*Run) Synopsis() string {
+	return "create and run a secure container"
+}
+
+// Usage implements subcommands.Command.Usage. The returned text ends with a
+// newline.
+func (*Run) Usage() string {
+	return `run [flags] <container id> - create and run a secure container.
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. It delegates to the
+// embedded Create so that "run" accepts the flags Create registers.
+func (r *Run) SetFlags(f *flag.FlagSet) {
+	r.Create.SetFlags(f)
+}
+
+// Execute implements subcommands.Command.Execute.
+func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	id := f.Arg(0)
+	conf := args[0].(*boot.Config)
+	waitStatus := args[1].(*syscall.WaitStatus)
+
+	bundleDir := r.bundleDir
+	if bundleDir == "" {
+		bundleDir = getwdOrDie()
+	}
+	spec, err := specutils.ReadSpec(bundleDir)
+	if err != nil {
+		Fatalf("error reading spec: %v", err)
+	}
+
+	ws, err := sandbox.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, commandLineFlags())
+	if err != nil {
+		Fatalf("error running sandbox: %v", err)
+	}
+
+	*waitStatus = ws
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go
new file mode 100644
index 000000000..a8e132497
--- /dev/null
+++ b/runsc/cmd/start.go
@@ -0,0 +1,64 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"flag"
+	"github.com/google/subcommands"
+	"gvisor.googlesource.com/gvisor/runsc/boot"
+	"gvisor.googlesource.com/gvisor/runsc/sandbox"
+)
+
+// Start implements subcommands.Command for the "start" command. It carries
+// no state because the command defines no flags.
+type Start struct{}
+
+// Name implements subcommands.Command.Name. It returns the command name,
+// "start".
+func (*Start) Name() string {
+	return "start"
+}
+
+// Synopsis implements subcommands.Command.Synopsis. It returns a one-line
+// description of the command.
+func (*Start) Synopsis() string {
+	return "start a secure container"
+}
+
+// Usage implements subcommands.Command.Usage. The returned text has no
+// trailing newline.
+func (*Start) Usage() string {
+	return `start <container id> - start a secure container.`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. The "start" command
+// registers no flags.
+func (*Start) SetFlags(f *flag.FlagSet) {}
+
+// Execute implements subcommands.Command.Execute.
+func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	id := f.Arg(0)
+	conf := args[0].(*boot.Config)
+
+	s, err := sandbox.Load(conf.RootDir, id)
+	if err != nil {
+		Fatalf("error loading sandbox: %v", err)
+	}
+	if err := s.Start(conf); err != nil {
+		Fatalf("error starting sandbox: %v", err)
+	}
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go
new file mode 100644
index 000000000..0b47f290a
--- /dev/null
+++ b/runsc/cmd/state.go
@@ -0,0 +1,73 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"encoding/json"
+	"os"
+
+	"context"
+	"flag"
+	"github.com/google/subcommands"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/runsc/boot"
+	"gvisor.googlesource.com/gvisor/runsc/sandbox"
+)
+
+// State implements subcommands.Command for the "state" command. It carries
+// no state because the command defines no flags.
+type State struct{}
+
+// Name implements subcommands.Command.Name. It returns the command name,
+// "state".
+func (*State) Name() string {
+	return "state"
+}
+
+// Synopsis implements subcommands.Command.Synopsis. It returns a one-line
+// description of the command.
+func (*State) Synopsis() string {
+	return "get the state of a sandbox"
+}
+
+// Usage implements subcommands.Command.Usage.
+//
+// NOTE(review): the text mentions [flags], but SetFlags registers none for
+// this command.
+func (*State) Usage() string {
+	return `state [flags] <container id> - get the state of a sandbox`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. The "state" command
+// registers no flags.
+func (*State) SetFlags(f *flag.FlagSet) {}
+
+// Execute implements subcommands.Command.Execute.
+//
+// It takes exactly one positional argument (the container ID) and a
+// *boot.Config in args[0], loads the sandbox stored under conf.RootDir and
+// writes its state to stdout as indented JSON. Failures, including a failed
+// write to stdout, are reported through Fatalf.
+func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	id := f.Arg(0)
+	conf := args[0].(*boot.Config)
+
+	s, err := sandbox.Load(conf.RootDir, id)
+	if err != nil {
+		Fatalf("error loading sandbox: %v", err)
+	}
+	log.Debugf("Returning state %+v", s)
+
+	// Write json-encoded state directly to stdout.
+	b, err := json.MarshalIndent(s.State(), "", "  ")
+	if err != nil {
+		Fatalf("error marshaling sandbox state: %v", err)
+	}
+	// Producing this output is the whole purpose of the command, so a failed
+	// write must not be reported as success.
+	if _, err := os.Stdout.Write(b); err != nil {
+		Fatalf("error writing sandbox state: %v", err)
+	}
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD
new file mode 100644
index 000000000..24e172f48
--- /dev/null
+++ b/runsc/fsgofer/BUILD
@@ -0,0 +1,33 @@
+package(licenses = ["notice"])  # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+# Library target for the fsgofer package. It is visible only to targets under
+# //runsc.
+go_library(
+    name = "fsgofer",
+    srcs = [
+        "fsgofer.go",
+        "fsgofer_unsafe.go",
+    ],
+    importpath = "gvisor.googlesource.com/gvisor/runsc/fsgofer",
+    visibility = [
+        "//runsc:__subpackages__",
+    ],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/fd",
+        "//pkg/log",
+        "//pkg/p9",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+# Tests for the fsgofer package. The library is embedded, so the tests are
+# compiled into the same package and can reach unexported identifiers.
+go_test(
+    name = "fsgofer_test",
+    size = "small",
+    srcs = ["fsgofer_test.go"],
+    embed = [":fsgofer"],
+    deps = [
+        "//pkg/log",
+        "//pkg/p9",
+    ],
+)
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
new file mode 100644
index 000000000..5ddc75a9d
--- /dev/null
+++ b/runsc/fsgofer/fsgofer.go
@@ -0,0 +1,937 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fsgofer implements p9.File giving access to local files using
+// a simple mapping from a path prefix that is added to the path requested
+// by the sandbox. Ex:
+//
+//   prefix: "/docker/imgs/alpine"
+//   app path: /bin/ls => /docker/imgs/alpine/bin/ls
+package fsgofer
+
+import (
+	"fmt"
+	"io"
+	"math"
+	"os"
+	"path"
+	"path/filepath"
+	"strings"
+	"sync"
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/fd"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/p9"
+)
+
+const (
+	// invalidMode is set to a value that doesn't match any other valid
+	// modes to ensure an unopened/closed file fails all mode checks.
+	invalidMode = p9.OpenFlags(math.MaxUint32)
+
+	// openFlags is OR'ed into the flags of every open performed in this
+	// file: O_NOFOLLOW makes an open fail rather than follow a symlink in the
+	// last path component, and O_CLOEXEC closes the descriptor on exec.
+	openFlags = syscall.O_NOFOLLOW | syscall.O_CLOEXEC
+)
+
+// fileType is the kind of host file a localFile wraps. Only the kinds listed
+// below are supported; newLocalFile rejects everything else with EINVAL.
+type fileType int
+
+const (
+	// regular is the zero value of fileType.
+	regular fileType = iota
+	directory
+	symlink
+)
+
+// String implements fmt.Stringer. It returns the lower-case name of the file
+// type, or "unknown" for a value that is not one of the declared constants.
+func (f fileType) String() string {
+	if f == regular {
+		return "regular"
+	}
+	if f == directory {
+		return "directory"
+	}
+	if f == symlink {
+		return "symlink"
+	}
+	return "unknown"
+}
+
+// Config sets configuration options for each attach point.
+type Config struct {
+	// ROMount is set to true if this is a readonly mount. Mutating
+	// operations on a readonly mount fail with EBADF.
+	ROMount bool
+
+	// LazyOpenForWrite makes the underlying file be opened in RDONLY
+	// mode initially and reopened when write access is requested.
+	// This works around the behavior in 'overlay2', which copies the
+	// entire file up eagerly when it's opened in write mode even if
+	// the file is never actually written to.
+	LazyOpenForWrite bool
+}
+
+// attachPoint implements p9.Attacher for a single host directory tree.
+type attachPoint struct {
+	// prefix is the host path prepended to every path passed to Attach.
+	prefix string
+
+	// conf is handed to every localFile created from this attach point.
+	conf Config
+}
+
+// NewAttachPoint creates a new attacher that gives local file
+// access to all files under 'prefix'. The configuration 'c' applies to every
+// file reached through the returned attacher.
+func NewAttachPoint(prefix string, c Config) p9.Attacher {
+	return &attachPoint{prefix: prefix, conf: c}
+}
+
+// Attach implements p9.Attacher.
+//
+// appPath must be absolute; it is joined to the attach point's prefix and the
+// resulting host path is opened read-only. The returned file wraps that open
+// descriptor. On any failure the descriptor is closed and a nil p9.File is
+// returned together with the error.
+func (a *attachPoint) Attach(appPath string) (p9.File, error) {
+	if !path.IsAbs(appPath) {
+		return nil, fmt.Errorf("invalid path %q", appPath)
+	}
+
+	root := filepath.Join(a.prefix, appPath)
+	f, err := os.OpenFile(root, openFlags|syscall.O_RDONLY, 0)
+	if err != nil {
+		return nil, fmt.Errorf("unable to open file %q, err: %v", root, err)
+	}
+	st, err := stat(int(f.Fd()))
+	if err != nil {
+		// Don't leak the descriptor that was just opened.
+		f.Close()
+		return nil, fmt.Errorf("failed to stat file %q, err: %v", root, err)
+	}
+	lf, err := newLocalFile(a.conf, f, root, st)
+	if err != nil {
+		f.Close()
+		// Return an untyped nil: passing the nil *localFile through would
+		// yield a non-nil p9.File interface value.
+		return nil, err
+	}
+	return lf, nil
+}
+
+func makeQID(stat syscall.Stat_t) p9.QID {
+	return p9.QID{
+		Type: p9.FileMode(stat.Mode).QIDType(),
+		Path: stat.Ino,
+	}
+}
+
+// isNameValid reports whether 'name' is acceptable as a single path
+// component: it must be non-empty, must not be "." or "..", and must not
+// contain a '/'. Rejected names are logged as warnings.
+func isNameValid(name string) bool {
+	switch {
+	case name == "", name == ".", name == "..":
+		// Empty and relative-reference names are rejected.
+	case strings.IndexByte(name, '/') >= 0:
+		// A separator would let the name reach outside the directory.
+	default:
+		return true
+	}
+	log.Warningf("Invalid name: %s", name)
+	return false
+}
+
+// localFile implements p9.File wrapping a local file. The underlying file
+// is opened during Walk() and stored in 'controlFile' to be used with other
+// operations. The mode in which the file is opened varies depending on the
+// configuration (see below). 'controlFile' is dup'ed when Walk(nil) is called
+// to clone the file.
+//
+// 'openedFile' is assigned when Open() is called. If requested open mode is
+// a subset of controlFile's mode, it's possible to use the same file. If mode
+// is not a subset, then another file is opened. Consequently, 'openedFile'
+// could have a mode wider than requested and must be verified before read/write
+// operations. Before the file is opened and after it's closed, 'mode' is set to
+// an invalid value to prevent an unopened file from being used.
+//
+// localFile has 2 modes of operation based on the configuration:
+//
+// ** conf.LazyOpenForWrite == false **
+// This is the preferred mode. 'controlFile' is opened in RW mode in Walk()
+// and used across all functions. The file is never reopened as the mode will
+// always be a super set of the requested open mode. This reduces the number of
+// syscalls required per operation and makes it resilient to renames anywhere
+// in the path to the file.
+//
+// ** conf.LazyOpenForWrite == true **
+// This mode is used for better performance with 'overlay2' storage driver.
+// overlay2 eagerly copies the entire file up when it's opened in write mode
+// which makes the mode above perform badly when several files are opened
+// for read (esp. startup). In this mode, 'controlFile' is opened as readonly
+// (or O_PATH for symlinks). Reopening the file is required if write mode
+// is requested in Open().
+type localFile struct {
+	p9.DefaultWalkGetAttr
+
+	// mu protects 'hostPath' when file is renamed.
+	mu sync.Mutex
+
+	// TODO: hostPath is not safe to use as path needs to be walked
+	// every time (and can change underneath us). Remove all usages.
+	hostPath string
+
+	// controlFile is opened when localFile is created and it's never nil
+	// until Close is called, which resets it to nil.
+	controlFile *os.File
+
+	// openedFile is nil until localFile is opened. It may point to controlFile
+	// or be a new file struct. See struct comment for more details.
+	openedFile *os.File
+
+	// mode is the mode in which the file was opened. Set to invalidMode
+	// if localFile isn't opened.
+	mode p9.OpenFlags
+
+	// ft is the type of the underlying host file.
+	ft fileType
+
+	// conf is the configuration of the attach point this file came from.
+	conf Config
+
+	// readDirMu protects against concurrent Readdir calls.
+	readDirMu sync.Mutex
+}
+
+// openAnyFile opens the entry 'name' inside the directory 'parent', trying
+// progressively weaker access modes until one succeeds. It returns the opened
+// file together with the host path computed by joining parent's hostPath and
+// 'name'. On failure it returns the errno of the last attempt; ENOENT ends
+// the attempts immediately.
+func openAnyFile(parent *localFile, name string) (*os.File, string, error) {
+	// Attempt to open file in the following mode in order:
+	//   1. RDWR: for files with rw mounts and LazyOpenForWrite disabled
+	//   2. RDONLY: for directories, ro mounts or LazyOpenForWrite enabled
+	//   3. PATH: for symlinks
+	modes := []int{syscall.O_RDWR, syscall.O_RDONLY, unix.O_PATH}
+	symlinkIdx := len(modes) - 1
+
+	startIdx := 0
+	if parent.conf.ROMount || parent.conf.LazyOpenForWrite {
+		// Skip attempt to open in RDWR based on configuration.
+		startIdx = 1
+	}
+
+	var err error
+	var fd int
+	for i := startIdx; i < len(modes); i++ {
+		fd, err = syscall.Openat(parent.controlFD(), name, openFlags|modes[i], 0)
+		if err == nil {
+			// openat succeeded, we're done.
+			break
+		}
+		switch e := extractErrno(err); e {
+		case syscall.ENOENT:
+			// File doesn't exist, no point in retrying.
+			return nil, "", e
+		case syscall.ELOOP:
+			if i < symlinkIdx {
+				// File was opened with O_NOFOLLOW, so this error can only happen when
+				// trying to open a symlink. Jump straight to flags compatible with symlink.
+				// The loop's i++ then lands on symlinkIdx.
+				i = symlinkIdx - 1
+			}
+		}
+		// openat failed. Try again with next mode, preserving 'err' in
+		// case this was the last attempt.
+		// NOTE(review): after the ELOOP jump above, 'i' (and thus the mode
+		// printed here) no longer identifies the attempt that just failed.
+		log.Debugf("Attempt %d to open file failed, mode: %#x, path: %s/%s, err: %v", i, openFlags|modes[i], parent.controlFile.Name(), name, err)
+	}
+	if err != nil {
+		// All attempts to open file have failed, return the last error.
+		log.Debugf("Failed to open file, path: %s/%s, err: %v", parent.controlFile.Name(), name, err)
+		return nil, "", extractErrno(err)
+	}
+
+	// hostPath may be rewritten by Rename, so read it under the lock.
+	parent.mu.Lock()
+	defer parent.mu.Unlock()
+	newPath := path.Join(parent.hostPath, name)
+
+	return os.NewFile(uintptr(fd), newPath), newPath, nil
+}
+
+func newLocalFile(conf Config, file *os.File, path string, stat syscall.Stat_t) (*localFile, error) {
+	var ft fileType
+	switch stat.Mode & syscall.S_IFMT {
+	case syscall.S_IFREG:
+		ft = regular
+	case syscall.S_IFDIR:
+		ft = directory
+	case syscall.S_IFLNK:
+		ft = symlink
+	default:
+		return nil, syscall.EINVAL
+	}
+	return &localFile{
+		hostPath:    path,
+		controlFile: file,
+		conf:        conf,
+		mode:        invalidMode,
+		ft:          ft,
+	}, nil
+}
+
+// newFDMaybe creates a fd.FD from a file, dup'ing the FD and setting it as
+// non-blocking. If anything fails, returns nil. It's better to have a file
+// without host FD, than to fail the operation.
+func newFDMaybe(file *os.File) *fd.FD {
+	hostFD, err := fd.NewFromFile(file)
+	if err != nil {
+		return nil
+	}
+
+	// The new descriptor is blocking; non-blocking is required.
+	err = syscall.SetNonblock(hostFD.FD(), true)
+	if err == nil {
+		return hostFD
+	}
+	hostFD.Close()
+	return nil
+}
+
+// stat returns the fstat(2) result for the descriptor 'fd'. On error the
+// returned Stat_t is the zero value.
+func stat(fd int) (syscall.Stat_t, error) {
+	var st syscall.Stat_t
+	err := syscall.Fstat(fd, &st)
+	if err != nil {
+		return syscall.Stat_t{}, err
+	}
+	return st, nil
+}
+
+// fchown changes the owner of the file referred to by 'fd' itself: the empty
+// path combined with AT_EMPTY_PATH targets the descriptor, and
+// AT_SYMLINK_NOFOLLOW keeps a symlink from being dereferenced.
+func fchown(fd int, uid p9.UID, gid p9.GID) error {
+	return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW)
+}
+
+// controlFD returns the host descriptor number of the control file.
+func (l *localFile) controlFD() int {
+	return int(l.controlFile.Fd())
+}
+
+// openedFD returns the host descriptor number of the opened file. It panics
+// when the file has not been opened.
+func (l *localFile) openedFD() int {
+	if f := l.openedFile; f != nil {
+		return int(f.Fd())
+	}
+	panic(fmt.Sprintf("trying to use an unopened file: %q", l.controlFile.Name()))
+}
+
+// Open implements p9.File.
+//
+// The control file is reused when the request is read-only or when lazy
+// reopening is disabled; otherwise the file is reopened by host path with the
+// requested mode. For regular files a dup'ed, non-blocking host descriptor is
+// donated to the caller when one can be created. Open panics if the file is
+// already opened.
+func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
+	if l.openedFile != nil {
+		panic(fmt.Sprintf("attempting to open already opened file: %q", l.controlFile.Name()))
+	}
+
+	// Check if control file can be used or if a new open must be created.
+	var newFile *os.File
+	if mode == p9.ReadOnly || !l.conf.LazyOpenForWrite {
+		log.Debugf("Open reusing control file, mode: %v, %q", mode, l.controlFile.Name())
+		newFile = l.controlFile
+	} else {
+		// Ideally reopen would call name_to_handle_at (with empty name) and open_by_handle_at
+		// to reopen the file without using 'hostPath'. However, name_to_handle_at and
+		// open_by_handle_at aren't supported by overlay2.
+		log.Debugf("Open reopening file, mode: %v, %q", mode, l.controlFile.Name())
+
+		l.mu.Lock()
+		f, err := os.OpenFile(l.hostPath, openFlags|mode.OSFlags(), 0)
+		l.mu.Unlock()
+		if err != nil {
+			return nil, p9.QID{}, 0, extractErrno(err)
+		}
+		newFile = f
+	}
+
+	st, err := stat(int(newFile.Fd()))
+	if err != nil {
+		// Only close a file opened above. The control file is still owned by
+		// this localFile and must stay usable after a failed Open.
+		if newFile != l.controlFile {
+			newFile.Close()
+		}
+		return nil, p9.QID{}, 0, extractErrno(err)
+	}
+
+	var hostFD *fd.FD
+	if st.Mode&syscall.S_IFMT == syscall.S_IFREG {
+		// Donate FD for regular files only.
+		hostFD = newFDMaybe(newFile)
+	}
+
+	// Set fields on success
+	l.openedFile = newFile
+	l.mode = mode
+	return hostFD, makeQID(st), 0, nil
+}
+
+// Create implements p9.File.
+//
+// It exclusively creates 'name' inside this directory, assigns it to
+// uid/gid and returns a new localFile that is already opened with 'mode'.
+// Readonly mounts fail with EBADF and invalid names with EINVAL.
+func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) {
+	if l.conf.ROMount {
+		return nil, nil, p9.QID{}, 0, syscall.EBADF
+	}
+	if !isNameValid(name) {
+		return nil, nil, p9.QID{}, 0, syscall.EINVAL
+	}
+
+	// One descriptor serves as both 'controlFile' and 'openedFile', so it must
+	// be readable for control purposes in addition to whatever the caller
+	// asked for. The resulting mode may therefore be wider than requested.
+	osFlags := openFlags | syscall.O_CREAT | syscall.O_EXCL
+	if mode == p9.WriteOnly {
+		osFlags |= syscall.O_RDWR
+	} else {
+		osFlags |= mode.OSFlags()
+	}
+
+	newFD, err := syscall.Openat(l.controlFD(), name, osFlags, uint32(perm.Permissions()))
+	if err != nil {
+		return nil, nil, p9.QID{}, 0, extractErrno(err)
+	}
+	if err := fchown(newFD, uid, gid); err != nil {
+		syscall.Close(newFD)
+		return nil, nil, p9.QID{}, 0, extractErrno(err)
+	}
+	st, err := stat(newFD)
+	if err != nil {
+		syscall.Close(newFD)
+		return nil, nil, p9.QID{}, 0, extractErrno(err)
+	}
+
+	// hostPath may be rewritten by Rename, so read it under the lock.
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	childPath := path.Join(l.hostPath, name)
+	file := os.NewFile(uintptr(newFD), childPath)
+	// 'ft' is left at its zero value, which is 'regular'.
+	child := &localFile{
+		hostPath:    childPath,
+		controlFile: file,
+		openedFile:  file,
+		mode:        mode,
+		conf:        l.conf,
+	}
+	return newFDMaybe(child.openedFile), child, makeQID(st), 0, nil
+}
+
+// Mkdir implements p9.File.
+//
+// It creates the directory 'name' inside this directory, assigns it to
+// uid/gid and returns its QID. Readonly mounts fail with EBADF and invalid
+// names with EINVAL.
+func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) {
+	if l.conf.ROMount {
+		return p9.QID{}, syscall.EBADF
+	}
+	if !isNameValid(name) {
+		return p9.QID{}, syscall.EINVAL
+	}
+
+	err := syscall.Mkdirat(l.controlFD(), name, uint32(perm.Permissions()))
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+
+	// The new directory is opened only long enough to chown and stat it.
+	dirFD, err := syscall.Openat(l.controlFD(), name, syscall.O_DIRECTORY|syscall.O_RDONLY|openFlags, 0)
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	defer syscall.Close(dirFD)
+
+	err = fchown(dirFD, uid, gid)
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	st, err := stat(dirFD)
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	return makeQID(st), nil
+}
+
+// Walk implements p9.File.
+//
+// With no names, the receiver is cloned: its control descriptor is dup'ed and
+// wrapped in a new, unopened localFile of the same file type. Otherwise each
+// name is opened relative to the previous component and the file for the last
+// component is returned together with the QIDs of every component walked.
+// Files opened for intermediate components are closed before returning, and
+// nothing opened by the walk is left open on failure.
+func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) {
+	// Duplicate current file if 'names' is empty.
+	if len(names) == 0 {
+		newFd, err := syscall.Dup(l.controlFD())
+		if err != nil {
+			return nil, nil, extractErrno(err)
+		}
+		st, err := stat(newFd)
+		if err != nil {
+			syscall.Close(newFd)
+			return nil, nil, extractErrno(err)
+		}
+
+		l.mu.Lock()
+		defer l.mu.Unlock()
+
+		c := &localFile{
+			hostPath:    l.hostPath,
+			controlFile: os.NewFile(uintptr(newFd), l.hostPath),
+			mode:        invalidMode,
+			// The clone refers to the same host file, so it has the same
+			// type. Leaving this unset would mark every clone as 'regular'.
+			ft:   l.ft,
+			conf: l.conf,
+		}
+		return []p9.QID{makeQID(st)}, c, nil
+	}
+
+	// release closes a file opened by this walk. The receiver belongs to the
+	// caller and is never closed here. Close errors are ignored: the file is
+	// being discarded and there is nothing useful to do with them.
+	release := func(f *localFile) {
+		if f != l {
+			f.Close()
+		}
+	}
+
+	var qids []p9.QID
+	last := l
+	for _, name := range names {
+		if !isNameValid(name) {
+			release(last)
+			return nil, nil, syscall.EINVAL
+		}
+
+		f, hostPath, err := openAnyFile(last, name)
+		// The parent was only needed to open the child relative to it.
+		release(last)
+		if err != nil {
+			return nil, nil, extractErrno(err)
+		}
+		st, err := stat(int(f.Fd()))
+		if err != nil {
+			f.Close()
+			return nil, nil, extractErrno(err)
+		}
+		// Every file reached from l carries l's configuration.
+		c, err := newLocalFile(l.conf, f, hostPath, st)
+		if err != nil {
+			f.Close()
+			return nil, nil, extractErrno(err)
+		}
+
+		qids = append(qids, makeQID(st))
+		last = c
+	}
+	return qids, last, nil
+}
+
+// StatFS implements p9.File. It reports the fstatfs(2) result of the control
+// file, copying over the fields that p9.FSStat has a counterpart for.
+func (l *localFile) StatFS() (p9.FSStat, error) {
+	var buf syscall.Statfs_t
+	err := syscall.Fstatfs(l.controlFD(), &buf)
+	if err != nil {
+		return p9.FSStat{}, extractErrno(err)
+	}
+
+	// Populate with what's available.
+	var out p9.FSStat
+	out.Type = uint32(buf.Type)
+	out.BlockSize = uint32(buf.Bsize)
+	out.Blocks = buf.Blocks
+	out.BlocksFree = buf.Bfree
+	out.BlocksAvailable = buf.Bavail
+	out.Files = buf.Files
+	out.FilesFree = buf.Ffree
+	out.NameLength = uint32(buf.Namelen)
+	return out, nil
+}
+
+// FSync implements p9.File. It syncs the opened file and fails with EBADF
+// when the file has not been opened.
+func (l *localFile) FSync() error {
+	f := l.openedFile
+	if f == nil {
+		return syscall.EBADF
+	}
+	err := f.Sync()
+	if err == nil {
+		return nil
+	}
+	return extractErrno(err)
+}
+
+// GetAttr implements p9.File.
+//
+// The requested mask is ignored: the control file is stat'ed and the same
+// fixed set of attributes is always reported as valid.
+func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) {
+	st, err := stat(l.controlFD())
+	if err != nil {
+		return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err)
+	}
+
+	// Attributes reported as valid.
+	mask := p9.AttrMask{
+		Mode:   true,
+		UID:    true,
+		GID:    true,
+		NLink:  true,
+		RDev:   true,
+		Size:   true,
+		Blocks: true,
+		ATime:  true,
+		MTime:  true,
+		CTime:  true,
+	}
+
+	// Values copied from the host stat.
+	attrs := p9.Attr{
+		Mode:             p9.FileMode(st.Mode),
+		UID:              p9.UID(st.Uid),
+		GID:              p9.GID(st.Gid),
+		NLink:            st.Nlink,
+		RDev:             st.Rdev,
+		Size:             uint64(st.Size),
+		BlockSize:        uint64(st.Blksize),
+		Blocks:           uint64(st.Blocks),
+		ATimeSeconds:     uint64(st.Atim.Sec),
+		ATimeNanoSeconds: uint64(st.Atim.Nsec),
+		MTimeSeconds:     uint64(st.Mtim.Sec),
+		MTimeNanoSeconds: uint64(st.Mtim.Nsec),
+		CTimeSeconds:     uint64(st.Ctim.Sec),
+		CTimeNanoSeconds: uint64(st.Ctim.Nsec),
+	}
+
+	return makeQID(st), mask, attrs, nil
+}
+
+// SetAttr implements p9.File. Due to mismatch in file API, options
+// cannot be changed atomically and user may see partial changes when
+// an error happens.
+//
+// Readonly mounts fail with EBADF and a mask containing attributes outside
+// the supported set fails with EPERM before anything is changed. Otherwise
+// every selected attribute is attempted even after an earlier one has failed,
+// and the errno of the last failing operation (or nil) is returned.
+func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
+	if l.conf.ROMount {
+		return syscall.EBADF
+	}
+
+	allowed := p9.SetAttrMask{
+		Permissions:        true,
+		UID:                true,
+		GID:                true,
+		Size:               true,
+		ATime:              true,
+		MTime:              true,
+		ATimeNotSystemTime: true,
+		MTimeNotSystemTime: true,
+	}
+
+	if valid.Empty() {
+		// Nothing to do.
+		return nil
+	}
+
+	// Handle all the sanity checks up front so that the client gets a
+	// consistent result that is not attribute dependent.
+	if !valid.IsSubsetOf(allowed) {
+		log.Warningf("SetAttr() failed for %q, mask: %v", l.controlFile.Name(), valid)
+		return syscall.EPERM
+	}
+
+	// Rename rewrites hostPath while holding mu, so take a snapshot under the
+	// same lock instead of reading the field unprotected below.
+	l.mu.Lock()
+	hostPath := l.hostPath
+	l.mu.Unlock()
+
+	fd := l.controlFD()
+	if l.conf.LazyOpenForWrite && l.ft == regular {
+		// Regular files are opened in RO mode when lazy open is set.
+		// Thus it needs to be reopened here for write.
+		f, err := os.OpenFile(hostPath, openFlags|os.O_WRONLY, 0)
+		if err != nil {
+			return extractErrno(err)
+		}
+		defer f.Close()
+		fd = int(f.Fd())
+	}
+
+	// The semantics are to either return an error if no changes were made,
+	// or no error if *all* changes were made. Well, this can be impossible
+	// if the filesystem rejects at least one of the changes, especially
+	// since some operations are not easy to undo atomically.
+	//
+	// This could be made better if SetAttr actually returned the changes
+	// it did make, so the client can at least know what has changed. So
+	// we at least attempt to make all of the changes and return a generic
+	// error if any of them fails, which at least doesn't bias any change
+	// over another.
+	var err error
+	if valid.Permissions {
+		if cerr := syscall.Fchmod(fd, uint32(attr.Permissions)); cerr != nil {
+			log.Debugf("SetAttr fchmod failed %q, err: %v", hostPath, cerr)
+			err = extractErrno(cerr)
+		}
+	}
+
+	if valid.Size {
+		if terr := syscall.Ftruncate(fd, int64(attr.Size)); terr != nil {
+			log.Debugf("SetAttr ftruncate failed %q, err: %v", hostPath, terr)
+			err = extractErrno(terr)
+		}
+	}
+
+	if valid.ATime || valid.MTime {
+		// Both timestamps start out as "leave unchanged".
+		utimes := [2]syscall.Timespec{
+			{Sec: 0, Nsec: linux.UTIME_OMIT},
+			{Sec: 0, Nsec: linux.UTIME_OMIT},
+		}
+		if valid.ATime {
+			if valid.ATimeNotSystemTime {
+				utimes[0].Sec = int64(attr.ATimeSeconds)
+				utimes[0].Nsec = int64(attr.ATimeNanoSeconds)
+			} else {
+				utimes[0].Nsec = linux.UTIME_NOW
+			}
+		}
+		if valid.MTime {
+			if valid.MTimeNotSystemTime {
+				utimes[1].Sec = int64(attr.MTimeSeconds)
+				utimes[1].Nsec = int64(attr.MTimeNanoSeconds)
+			} else {
+				utimes[1].Nsec = linux.UTIME_NOW
+			}
+		}
+
+		if l.ft == symlink {
+			// utimensat operates differently than other syscalls. To operate on a
+			// symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty
+			// name.
+			//
+			// The open error gets its own name so that it does not shadow the
+			// accumulated 'err', which the utimensat failure below must update.
+			parent, oerr := os.OpenFile(path.Dir(hostPath), openFlags|unix.O_PATH, 0)
+			if oerr != nil {
+				return extractErrno(oerr)
+			}
+			defer parent.Close()
+
+			if terr := utimensat(int(parent.Fd()), path.Base(hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); terr != nil {
+				log.Debugf("SetAttr utimens failed %q, err: %v", hostPath, terr)
+				err = extractErrno(terr)
+			}
+		} else {
+			// Directories and regular files can operate directly on the fd
+			// using empty name.
+			if terr := utimensat(fd, "", utimes, 0); terr != nil {
+				log.Debugf("SetAttr utimens failed %q, err: %v", hostPath, terr)
+				err = extractErrno(terr)
+			}
+		}
+	}
+
+	if valid.UID || valid.GID {
+		// -1 leaves the corresponding ID unchanged.
+		uid := -1
+		if valid.UID {
+			uid = int(attr.UID)
+		}
+		gid := -1
+		if valid.GID {
+			gid = int(attr.GID)
+		}
+		if oerr := syscall.Fchownat(fd, "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil {
+			log.Debugf("SetAttr fchownat failed %q, err: %v", hostPath, oerr)
+			err = extractErrno(oerr)
+		}
+	}
+
+	return err
+}
+
+// Remove implements p9.File.
+//
+// This is deprecated in favor of UnlinkAt and always fails with ENOSYS.
+func (*localFile) Remove() error {
+	return syscall.ENOSYS
+}
+
+// Rename implements p9.File.
+//
+// The file is moved, by host path, to 'name' inside 'directory', which must
+// be a *localFile. On success the recorded host path is updated. Readonly
+// mounts fail with EBADF and invalid names with EINVAL.
+func (l *localFile) Rename(directory p9.File, name string) error {
+	if l.conf.ROMount {
+		return syscall.EBADF
+	}
+	if !isNameValid(name) {
+		return syscall.EINVAL
+	}
+
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	// TODO: change to renameat(2)
+	dstDir := directory.(*localFile)
+	dst := path.Join(dstDir.hostPath, name)
+	err := os.Rename(l.hostPath, dst)
+	if err != nil {
+		return extractErrno(err)
+	}
+
+	// Update path on success.
+	// TODO: this doesn't cover cases where any of the
+	// parents have been renamed.
+	l.hostPath = dst
+	return nil
+}
+
+// RenameAt implements p9.File.RenameAt.
+//
+// Code still uses [deprecated] Rename(), so this always fails with ENOSYS.
+func (*localFile) RenameAt(_ string, _ p9.File, _ string) error {
+	return syscall.ENOSYS
+}
+
+// ReadAt implements p9.File.
+//
+// The file must have been opened for reading, otherwise EBADF is returned.
+// Reaching end of file is not reported as an error: the bytes read so far are
+// returned with a nil error.
+func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) {
+	if !(l.mode == p9.ReadOnly || l.mode == p9.ReadWrite) {
+		return 0, syscall.EBADF
+	}
+	if l.openedFile == nil {
+		return 0, syscall.EBADF
+	}
+
+	n, err := l.openedFile.ReadAt(p, int64(offset))
+	if err == nil || err == io.EOF {
+		return n, nil
+	}
+	return n, extractErrno(err)
+}
+
+// WriteAt implements p9.File.
+//
+// The file must have been opened for writing, otherwise EBADF is returned.
+func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) {
+	if !(l.mode == p9.WriteOnly || l.mode == p9.ReadWrite) {
+		return 0, syscall.EBADF
+	}
+	if l.openedFile == nil {
+		return 0, syscall.EBADF
+	}
+
+	n, err := l.openedFile.WriteAt(p, int64(offset))
+	if err == nil {
+		return n, nil
+	}
+	return n, extractErrno(err)
+}
+
+// Symlink implements p9.File.
+//
+// It creates the symlink 'newName' inside this directory pointing at
+// 'target', assigns it to uid/gid and returns its QID. Readonly mounts fail
+// with EBADF and invalid names with EINVAL.
+func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) {
+	if l.conf.ROMount {
+		return p9.QID{}, syscall.EBADF
+	}
+	if !isNameValid(newName) {
+		return p9.QID{}, syscall.EINVAL
+	}
+
+	err := unix.Symlinkat(target, l.controlFD(), newName)
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+
+	// The new symlink is opened (with O_PATH) only long enough to chown and
+	// stat it.
+	linkFD, err := syscall.Openat(l.controlFD(), newName, unix.O_PATH|openFlags, 0)
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	defer syscall.Close(linkFD)
+
+	err = fchown(linkFD, uid, gid)
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	st, err := stat(linkFD)
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	return makeQID(st), nil
+}
+
+// Link implements p9.File.
+//
+// It creates the hard link 'newName' inside this directory to the file
+// behind 'target', which must be a *localFile. Readonly mounts fail with
+// EBADF and invalid names with EINVAL.
+func (l *localFile) Link(target p9.File, newName string) error {
+	if l.conf.ROMount {
+		return syscall.EBADF
+	}
+	if !isNameValid(newName) {
+		return syscall.EINVAL
+	}
+
+	// The empty path with AT_EMPTY_PATH links the source descriptor itself.
+	src := target.(*localFile)
+	err := unix.Linkat(src.controlFD(), "", l.controlFD(), newName, linux.AT_EMPTY_PATH)
+	if err != nil {
+		return extractErrno(err)
+	}
+	return nil
+}
+
+// Mknod implements p9.File.
+//
+// Not implemented; always fails with ENOSYS.
+func (*localFile) Mknod(_ string, _ p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) {
+	return p9.QID{}, syscall.ENOSYS
+}
+
+// UnlinkAt implements p9.File.
+//
+// It removes the entry 'name' from this directory, passing 'flags' through
+// to unlinkat(2). Readonly mounts fail with EBADF and invalid names with
+// EINVAL.
+func (l *localFile) UnlinkAt(name string, flags uint32) error {
+	if l.conf.ROMount {
+		return syscall.EBADF
+	}
+	if !isNameValid(name) {
+		return syscall.EINVAL
+	}
+
+	err := unix.Unlinkat(l.controlFD(), name, int(flags))
+	if err != nil {
+		return extractErrno(err)
+	}
+	return nil
+}
+
+// Readdir implements p9.File.
+//
+// The file must have been opened for reading, otherwise EBADF is returned.
+// The whole directory is listed on every call and 'offset' selects the index
+// of the first name to report; 'count' is not consulted. Names that cannot be
+// stat'ed are left out of the result.
+func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) {
+	if !(l.mode == p9.ReadOnly || l.mode == p9.ReadWrite) {
+		return nil, syscall.EBADF
+	}
+	if l.openedFile == nil {
+		return nil, syscall.EBADF
+	}
+
+	// Readdirnames is a cursor over the directory, so rewind to the start to
+	// read all of its contents. The operation is stateful, hence the lock.
+	list := func() ([]string, error) {
+		l.readDirMu.Lock()
+		defer l.readDirMu.Unlock()
+
+		if _, err := l.openedFile.Seek(0, 0); err != nil {
+			return nil, err
+		}
+		return l.openedFile.Readdirnames(-1)
+	}
+	names, err := list()
+	if err != nil {
+		return nil, extractErrno(err)
+	}
+
+	var dirents []p9.Dirent
+	// The i >= 0 test guards against an offset that overflows int.
+	for i := int(offset); i >= 0 && i < len(names); i++ {
+		name := names[i]
+		st, err := statAt(l.openedFD(), name)
+		if err != nil {
+			continue
+		}
+		qid := makeQID(st)
+		entry := p9.Dirent{
+			QID:    qid,
+			Type:   qid.Type,
+			Name:   name,
+			Offset: uint64(i + 1),
+		}
+		dirents = append(dirents, entry)
+	}
+	return dirents, nil
+}
+
+// Readlink implements p9.File.
+func (l *localFile) Readlink() (string, error) {
+	// Shamelessly stolen from os.Readlink (added upper bound limit to buffer).
+	for len := 128; len < 1024*1024; len *= 2 {
+		b := make([]byte, len)
+		n, err := unix.Readlinkat(l.controlFD(), "", b)
+		if err != nil {
+			return "", extractErrno(err)
+		}
+		if n < len {
+			return string(b[:n]), nil
+		}
+	}
+	return "", syscall.ENOMEM
+}
+
+// Flush implements p9.File. It does nothing and always succeeds.
+func (l *localFile) Flush() error {
+	return nil
+}
+
+// Connect implements p9.File. Connecting is not supported; it always fails
+// with ECONNREFUSED.
+func (l *localFile) Connect(p9.ConnectFlags) (*fd.FD, error) {
+	return nil, syscall.ECONNREFUSED
+}
+
+// Close implements p9.File.
+//
+// Both the control file and, when it is a distinct file, the opened file are
+// closed, and the localFile is reset to its unopened state. If both closes
+// fail, the error from the control file is returned.
+func (l *localFile) Close() error {
+	err := l.controlFile.Close()
+
+	// Close only once in case opened and control files point to
+	// the same os.File struct.
+	if l.openedFile != nil && l.openedFile != l.controlFile {
+		// Keep an earlier failure instead of overwriting it, possibly with
+		// nil, with the result of this close.
+		if oerr := l.openedFile.Close(); err == nil {
+			err = oerr
+		}
+	}
+
+	l.openedFile = nil
+	l.controlFile = nil
+	l.mode = invalidMode
+	return err
+}
+
+// extractErrno tries to determine the errno.
+//
+// The well-known os sentinel errors map to their errno counterparts, a
+// syscall.Errno is returned as is, and the os wrapper error types are
+// unwrapped recursively. Anything else is reported as EIO. A nil error yields
+// 0 and is logged, since callers are not expected to pass one.
+func extractErrno(err error) syscall.Errno {
+	if err == nil {
+		// This should never happen. The likely result will be that
+		// some user gets the frustration "error: SUCCESS" message.
+		log.Warningf("extractErrno called with nil error!")
+		return 0
+	}
+
+	// Sentinel errors exposed by package os.
+	sentinels := []struct {
+		err   error
+		errno syscall.Errno
+	}{
+		{os.ErrNotExist, syscall.ENOENT},
+		{os.ErrExist, syscall.EEXIST},
+		{os.ErrPermission, syscall.EACCES},
+		{os.ErrInvalid, syscall.EINVAL},
+	}
+	for _, s := range sentinels {
+		if err == s.err {
+			return s.errno
+		}
+	}
+
+	// See if it's an errno or a common wrapped error.
+	switch wrapped := err.(type) {
+	case syscall.Errno:
+		return wrapped
+	case *os.PathError:
+		return extractErrno(wrapped.Err)
+	case *os.LinkError:
+		return extractErrno(wrapped.Err)
+	case *os.SyscallError:
+		return extractErrno(wrapped.Err)
+	}
+
+	// Fall back to EIO.
+	log.Debugf("Unknown error: %v, defaulting to EIO", err)
+	return syscall.EIO
+}
diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go
new file mode 100644
index 000000000..7d834d596
--- /dev/null
+++ b/runsc/fsgofer/fsgofer_test.go
@@ -0,0 +1,576 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsgofer
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"syscall"
+	"testing"
+
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/p9"
+)
+
+// init enables debug logging for the tests and fills allConfs with the
+// read-write configurations followed by the read-only ones.
+func init() {
+	log.SetLevel(log.Debug)
+
+	for _, group := range [][]Config{rwConfs, roConfs} {
+		allConfs = append(allConfs, group...)
+	}
+}
+
+var (
+	// allTypes lists the file types that setup() knows how to create.
+	allTypes = []fileType{regular, directory, symlink}
+
+	// allConfs is filled in by init(): rwConfs followed by roConfs.
+	allConfs []Config
+
+	// rwConfs holds the writable configurations, with and without lazy open.
+	rwConfs = []Config{
+		{ROMount: false, LazyOpenForWrite: false},
+		{ROMount: false, LazyOpenForWrite: true},
+	}
+	// roConfs holds the read-only configurations, with and without lazy open.
+	roConfs = []Config{
+		{ROMount: true, LazyOpenForWrite: false},
+		{ROMount: true, LazyOpenForWrite: true},
+	}
+)
+
+// state bundles what runCustom hands to each test callback.
+type state struct {
+	// root is the file returned by attaching at "/".
+	root *localFile
+	// file is the file under test, reached by walking from root.
+	file *localFile
+	// conf is the configuration the attach point was created with.
+	conf Config
+	// ft is the type of file that was set up for this run.
+	ft   fileType
+}
+
+// String labels the state in test messages. Only the lazy-open setting and
+// the file type are included; ROMount is not part of the label.
+func (s state) String() string {
+	return fmt.Sprintf("lazyopen(%v)-%v", s.conf.LazyOpenForWrite, s.ft)
+}
+
+// runAll runs test for every file type in allTypes under every configuration
+// in allConfs.
+func runAll(t *testing.T, test func(*testing.T, state)) {
+	runCustom(t, allTypes, allConfs, test)
+}
+
+// runCustom runs test once for every combination of a configuration in confs
+// and a file type in types.
+//
+// For each combination a fresh temporary tree is built with setup, attached
+// using the configuration under test, and the entry of the requested type is
+// walked to. The resulting state is passed to test. The walked file, the root
+// and the temporary tree are released before the next combination starts,
+// including when test aborts through t.Fatalf.
+func runCustom(t *testing.T, types []fileType, confs []Config, test func(*testing.T, state)) {
+	for _, c := range confs {
+		t.Logf("Config: %+v", c)
+
+		for _, ft := range types {
+			t.Logf("File type: %v", ft)
+
+			// Each combination runs in its own function so that the deferred
+			// cleanup fires per iteration instead of piling up until
+			// runCustom returns.
+			func() {
+				path, name, err := setup(ft)
+				if err != nil {
+					t.Fatalf("%v", err)
+				}
+				defer os.RemoveAll(path)
+
+				a := NewAttachPoint(path, c)
+				root, err := a.Attach("/")
+				if err != nil {
+					t.Fatalf("Attach(%q) failed, err: %v", "/", err)
+				}
+				defer root.Close()
+
+				_, file, err := root.Walk([]string{name})
+				if err != nil {
+					t.Fatalf("root.Walk({%q}) failed, err: %v", name, err)
+				}
+				defer file.Close()
+
+				st := state{root: root.(*localFile), file: file.(*localFile), conf: c, ft: ft}
+				test(t, st)
+			}()
+		}
+	}
+}
+
+// setup creates a temporary directory holding a single entry of type ft and
+// returns the directory path together with the entry's name.
+//
+// The entry is created through an attach point using the zero Config. On
+// success the caller owns the directory and must remove it. On failure the
+// directory is removed here, because the caller is not given its path. An
+// unknown file type panics.
+func setup(ft fileType) (string, string, error) {
+	path, err := ioutil.TempDir("", "root-")
+	if err != nil {
+		return "", "", fmt.Errorf("ioutil.TempDir() failed, err: %v", err)
+	}
+
+	// fail discards the partially built tree and reports the error.
+	fail := func(format string, args ...interface{}) (string, string, error) {
+		os.RemoveAll(path)
+		return "", "", fmt.Errorf(format, args...)
+	}
+
+	// First attach with writable configuration to setup tree.
+	a := NewAttachPoint(path, Config{})
+	root, err := a.Attach("/")
+	if err != nil {
+		return fail("Attach(%q) failed, err: %v", "/", err)
+	}
+	defer root.Close()
+
+	var name string
+	switch ft {
+	case regular:
+		name = "file"
+		_, f, _, _, err := root.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
+		if err != nil {
+			return fail("root.Create(%q) failed, err: %v", name, err)
+		}
+		defer f.Close()
+	case directory:
+		name = "dir"
+		if _, err := root.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
+			return fail("root.Mkdir(%q) failed, err: %v", name, err)
+		}
+	case symlink:
+		name = "symlink"
+		if _, err := root.Symlink("/some/target", name, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
+			return fail("root.Symlink(%q) failed, err: %v", name, err)
+		}
+	default:
+		panic(fmt.Sprintf("unknown file type %v", ft))
+	}
+	return path, name, nil
+}
+
+// createFile creates an entry called name inside dir, passing the
+// p9.ReadWrite flag, mode 0777 and the current process's uid and gid, and
+// returns the created file as a *localFile.
+func createFile(dir *localFile, name string) (*localFile, error) {
+	_, f, _, _, err := dir.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
+	if err != nil {
+		return nil, err
+	}
+	return f.(*localFile), nil
+}
+
+// TestReadWrite checks that a file opened with each open mode allows exactly
+// the reads and writes that the mode permits.
+//
+// A file named "test" is created in the directory under test and filled with
+// "foobar". It is then walked to and opened once per mode; WriteAt and ReadAt
+// must succeed when the mode allows them and fail otherwise.
+func TestReadWrite(t *testing.T) {
+	runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+		child, err := createFile(s.file, "test")
+		if err != nil {
+			t.Fatalf("%v: createFile() failed, err: %v", s, err)
+		}
+		defer child.Close()
+		b := []byte("foobar")
+		w, err := child.WriteAt(b, 0)
+		if err != nil {
+			t.Fatalf("%v: WriteAt() failed, err: %v", s, err)
+		}
+		if w != len(b) {
+			t.Fatalf("%v: WriteAt() was partial, got: %d, expected: %d", s, w, len(b))
+		}
+		for _, test := range []struct {
+			flags p9.OpenFlags
+			read  bool
+			write bool
+		}{
+			{flags: p9.ReadOnly, read: true, write: false},
+			{flags: p9.WriteOnly, read: false, write: true},
+			{flags: p9.ReadWrite, read: true, write: true},
+		} {
+			_, l, err := s.file.Walk([]string{"test"})
+			if err != nil {
+				t.Fatalf("%v: Walk(%s) failed, err: %v", s, "test", err)
+			}
+			// Every walk returns a new file that has to be released. The
+			// table has three entries, so deferring until this callback
+			// returns is fine and also covers the t.Fatalf paths below.
+			defer l.Close()
+			if _, _, _, err := l.Open(test.flags); err != nil {
+				t.Fatalf("%v: Open(%v) failed, err: %v", s, test.flags, err)
+			}
+
+			w, err = l.WriteAt(b, 0)
+			if test.write {
+				if err != nil {
+					t.Fatalf("%v, %v: WriteAt() failed, err: %v", s, test.flags, err)
+				}
+				if w != len(b) {
+					t.Fatalf("%v, %v: WriteAt() was partial, got: %d, expected: %d", s, test.flags, w, len(b))
+				}
+			} else {
+				if err == nil {
+					t.Fatalf("%v, %v: WriteAt() should have failed", s, test.flags)
+				}
+			}
+
+			rBuf := make([]byte, len(b))
+			r, err := l.ReadAt(rBuf, 0)
+			if test.read {
+				if err != nil {
+					t.Fatalf("%v, %v: ReadAt() failed, err: %v", s, test.flags, err)
+				}
+				if r != len(rBuf) {
+					t.Fatalf("%v, %v: ReadAt() was partial, got: %d, expected: %d", s, test.flags, r, len(rBuf))
+				}
+				if string(rBuf) != "foobar" {
+					t.Fatalf("%v, %v: ReadAt() wrong data, got: %s, expected: %s", s, test.flags, string(rBuf), "foobar")
+				}
+			} else {
+				if err == nil {
+					t.Fatalf("%v, %v: ReadAt() should have failed", s, test.flags)
+				}
+			}
+		}
+	})
+}
+
+// TestCreate checks that a file returned by Create can be used right away
+// with the access its open flags allow.
+//
+// One file is created per flag set. Writing must always succeed; reading
+// must succeed only when the file was created with read access.
+func TestCreate(t *testing.T) {
+	runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+		for i, test := range []struct {
+			flags p9.OpenFlags
+			read  bool
+		}{
+			{flags: p9.WriteOnly, read: false},
+			{flags: p9.ReadWrite, read: true},
+		} {
+			_, l, _, _, err := s.file.Create(fmt.Sprintf("test-%d", i), test.flags, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
+			if err != nil {
+				t.Fatalf("%v, %v: Create() failed, err: %v", s, test.flags, err)
+			}
+			// Release the created file when this callback returns. The table
+			// has two entries, so the deferred calls stay bounded.
+			defer l.Close()
+
+			b := []byte("foobar")
+			w, err := l.WriteAt(b, 0)
+			if err != nil {
+				t.Fatalf("%v, %v: WriteAt() failed, err: %v", s, test.flags, err)
+			}
+			if w != len(b) {
+				t.Fatalf("%v, %v: WriteAt() was partial, got: %d, expected: %d", s, test.flags, w, len(b))
+			}
+
+			rBuf := make([]byte, len(b))
+			r, err := l.ReadAt(rBuf, 0)
+			if test.read {
+				if err != nil {
+					t.Fatalf("%v, %v: ReadAt() failed, err: %v", s, test.flags, err)
+				}
+				if r != len(rBuf) {
+					t.Fatalf("%v, %v: ReadAt() was partial, got: %d, expected: %d", s, test.flags, r, len(rBuf))
+				}
+				if string(rBuf) != "foobar" {
+					t.Fatalf("%v, %v: ReadAt() wrong data, got: %s, expected: %s", s, test.flags, string(rBuf), "foobar")
+				}
+			} else {
+				if err == nil {
+					t.Fatalf("%v, %v: ReadAt() should have failed", s, test.flags)
+				}
+			}
+		}
+	})
+}
+
+// TestUnopened checks that the operations which need an opened file fail
+// with EBADF when the file has only been walked to.
+func TestUnopened(t *testing.T) {
+	runCustom(t, []fileType{regular}, allConfs, func(t *testing.T, s state) {
+		data := []byte("foobar")
+
+		_, err := s.file.WriteAt(data, 0)
+		if err != syscall.EBADF {
+			t.Errorf("%v: WriteAt() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+
+		_, err = s.file.ReadAt(data, 0)
+		if err != syscall.EBADF {
+			t.Errorf("%v: ReadAt() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+
+		_, err = s.file.Readdir(0, 100)
+		if err != syscall.EBADF {
+			t.Errorf("%v: Readdir() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+
+		err = s.file.FSync()
+		if err != syscall.EBADF {
+			t.Errorf("%v: FSync() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+	})
+}
+
+// SetGetAttr applies attr to l (restricted to the fields selected by valid)
+// and then reads the attributes back with an empty p9.AttrMask.
+//
+// It returns the attributes read back, or the zero p9.Attr and the error
+// when either SetAttr or GetAttr fails.
+func SetGetAttr(l *localFile, valid p9.SetAttrMask, attr p9.SetAttr) (p9.Attr, error) {
+	if err := l.SetAttr(valid, attr); err != nil {
+		return p9.Attr{}, err
+	}
+	_, _, a, err := l.GetAttr(p9.AttrMask{})
+	if err != nil {
+		return p9.Attr{}, err
+	}
+	return a, nil
+}
+
+// TestSetAttrPerm checks that permissions can be set and read back on every
+// file type except symlinks, where the attempt is expected to fail.
+func TestSetAttrPerm(t *testing.T) {
+	runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) {
+		mask := p9.SetAttrMask{Permissions: true}
+		want := p9.SetAttr{Permissions: 0777}
+		got, err := SetGetAttr(s.file, mask, want)
+
+		// Symlinks are the one type where the call must not succeed.
+		if s.ft == symlink {
+			if err == nil {
+				t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, want.Permissions)
+			}
+			return
+		}
+
+		if err != nil {
+			t.Fatalf("%v: SetGetAttr(valid, %v) failed, err: %v", s, want.Permissions, err)
+		}
+		if perms := got.Mode.Permissions(); perms != want.Permissions {
+			t.Errorf("%v: wrong permission, got: %v, expected: %v", s, perms, want.Permissions)
+		}
+	})
+}
+
+// TestSetAttrSize checks that the size of a regular file can be set to
+// several values and read back, and that setting the size of a symlink or a
+// directory fails.
+func TestSetAttrSize(t *testing.T) {
+	runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) {
+		for _, size := range []uint64{1024, 0, 1024 * 1024} {
+			valid := p9.SetAttrMask{Size: true}
+			attr := p9.SetAttr{Size: size}
+			got, err := SetGetAttr(s.file, valid, attr)
+			if s.ft == symlink || s.ft == directory {
+				if err == nil {
+					// Report the size that was attempted; the permissions
+					// field is not set in this test.
+					t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Size)
+				}
+				// Run for one size only, they will all fail the same way.
+				return
+			}
+			if err != nil {
+				t.Fatalf("%v: SetGetAttr(valid, %v) failed, err: %v", s, attr.Size, err)
+			}
+			if got.Size != size {
+				t.Errorf("%v: wrong size, got: %v, expected: %v", s, got.Size, size)
+			}
+		}
+	})
+}
+
+func TestSetAttrTime(t *testing.T) {
+	runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) {
+		valid := p9.SetAttrMask{ATime: true, ATimeNotSystemTime: true}
+		attr := p9.SetAttr{ATimeSeconds: 123, ATimeNanoSeconds: 456}
+		got, err := SetGetAttr(s.file, valid, attr)
+		if err != nil {
+			t.Fatalf("%v: SetGetAttr(valid, %v:%v) failed, err: %v", s, attr.ATimeSeconds, attr.ATimeNanoSeconds, err)
+		}
+		if got.ATimeSeconds != 123 {
+			t.Errorf("%v: wrong ATimeSeconds, got: %v, expected: %v", s, got.ATimeSeconds, 123)
+		}
+		if got.ATimeNanoSeconds != 456 {
+			t.Errorf("%v: wrong ATimeNanoSeconds, got: %v, expected: %v", s, got.ATimeNanoSeconds, 456)
+		}
+
+		valid = p9.SetAttrMask{MTime: true, MTimeNotSystemTime: true}
+		attr = p9.SetAttr{MTimeSeconds: 789, MTimeNanoSeconds: 012}
+		got, err = SetGetAttr(s.file, valid, attr)
+		if err != nil {
+			t.Fatalf("%v: SetGetAttr(valid, %v:%v) failed, err: %v", s, attr.MTimeSeconds, attr.MTimeNanoSeconds, err)
+		}
+		if got.MTimeSeconds != 789 {
+			t.Errorf("%v: wrong MTimeSeconds, got: %v, expected: %v", s, got.MTimeSeconds, 789)
+		}
+		if got.MTimeNanoSeconds != 012 {
+			t.Errorf("%v: wrong MTimeNanoSeconds, got: %v, expected: %v", s, got.MTimeNanoSeconds, 012)
+		}
+	})
+}
+
+// TestSetAttrOwner checks that the owning UID of every file type can be
+// changed and read back. The test is skipped unless it runs as uid 0.
+func TestSetAttrOwner(t *testing.T) {
+	if uid := os.Getuid(); uid != 0 {
+		t.Skipf("SetAttr(owner) test requires CAP_CHOWN, running as %d", uid)
+	}
+
+	runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) {
+		// Any UID other than the current one will do.
+		newUID := os.Getuid() + 1
+		mask := p9.SetAttrMask{UID: true}
+		want := p9.SetAttr{UID: p9.UID(newUID)}
+
+		got, err := SetGetAttr(s.file, mask, want)
+		if err != nil {
+			t.Fatalf("%v: SetGetAttr(valid, %v) failed, err: %v", s, want.UID, err)
+		}
+		if got.UID == p9.UID(newUID) {
+			return
+		}
+		t.Errorf("%v: wrong uid, got: %v, expected: %v", s, got.UID, newUID)
+	})
+}
+
+// TestLink checks that a link to the file under test can be created in a new
+// directory, and that linking a directory fails with EPERM. The test is
+// skipped unless it runs as uid 0.
+func TestLink(t *testing.T) {
+	if os.Getuid() != 0 {
+		t.Skipf("Link test requires CAP_DAC_READ_SEARCH, running as %d", os.Getuid())
+	}
+	runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) {
+		const dirName = "linkdir"
+		const linkFile = "link"
+		if _, err := s.root.Mkdir(dirName, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
+			t.Fatalf("%v: MkDir(%s) failed, err: %v", s, dirName, err)
+		}
+		_, dir, err := s.root.Walk([]string{dirName})
+		if err != nil {
+			t.Fatalf("%v: Walk({%s}) failed, err: %v", s, dirName, err)
+		}
+		// The walked directory is a separate file and must be released.
+		defer dir.Close()
+
+		err = dir.Link(s.file, linkFile)
+		if s.ft == directory {
+			if err != syscall.EPERM {
+				t.Errorf("%v: Link(target, %s) should have failed, got: %v, expected: syscall.EPERM", s, linkFile, err)
+			}
+			return
+		}
+		if err != nil {
+			t.Errorf("%v: Link(target, %s) failed, err: %v", s, linkFile, err)
+		}
+	})
+}
+
+// TestROMountChecks checks that every mutating operation is rejected with
+// EBADF when the attach point uses a read-only configuration.
+func TestROMountChecks(t *testing.T) {
+	runCustom(t, allTypes, roConfs, func(t *testing.T, s state) {
+		uid, gid := p9.UID(os.Getuid()), p9.GID(os.Getgid())
+
+		_, _, _, _, err := s.file.Create("..", p9.ReadWrite, 0777, uid, gid)
+		if err != syscall.EBADF {
+			t.Errorf("%v: Create() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+
+		_, err = s.file.Mkdir("..", 0777, uid, gid)
+		if err != syscall.EBADF {
+			t.Errorf("%v: MkDir() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+
+		err = s.file.Rename(s.file, "..")
+		if err != syscall.EBADF {
+			t.Errorf("%v: Rename() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+
+		_, err = s.file.Symlink("some_place", "..", uid, gid)
+		if err != syscall.EBADF {
+			t.Errorf("%v: Symlink() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+
+		err = s.file.UnlinkAt("..", 0)
+		if err != syscall.EBADF {
+			t.Errorf("%v: UnlinkAt() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+
+		err = s.file.Link(s.file, "..")
+		if err != syscall.EBADF {
+			t.Errorf("%v: Link() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+
+		err = s.file.SetAttr(p9.SetAttrMask{Size: true}, p9.SetAttr{Size: 0})
+		if err != syscall.EBADF {
+			t.Errorf("%v: SetAttr() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+	})
+}
+
+// TestInvalidName checks that every operation taking a name rejects ".."
+// with EINVAL.
+func TestInvalidName(t *testing.T) {
+	runCustom(t, []fileType{regular}, rwConfs, func(t *testing.T, s state) {
+		uid, gid := p9.UID(os.Getuid()), p9.GID(os.Getgid())
+
+		_, _, _, _, err := s.file.Create("..", p9.ReadWrite, 0777, uid, gid)
+		if err != syscall.EINVAL {
+			t.Errorf("%v: Create() should have failed, got: %v, expected: syscall.EINVAL", s, err)
+		}
+
+		_, _, err = s.file.Walk([]string{".."})
+		if err != syscall.EINVAL {
+			t.Errorf("%v: Walk() should have failed, got: %v, expected: syscall.EINVAL", s, err)
+		}
+
+		_, err = s.file.Mkdir("..", 0777, uid, gid)
+		if err != syscall.EINVAL {
+			t.Errorf("%v: MkDir() should have failed, got: %v, expected: syscall.EINVAL", s, err)
+		}
+
+		err = s.file.Rename(s.file, "..")
+		if err != syscall.EINVAL {
+			t.Errorf("%v: Rename() should have failed, got: %v, expected: syscall.EINVAL", s, err)
+		}
+
+		_, err = s.file.Symlink("some_place", "..", uid, gid)
+		if err != syscall.EINVAL {
+			t.Errorf("%v: Symlink() should have failed, got: %v, expected: syscall.EINVAL", s, err)
+		}
+
+		err = s.file.UnlinkAt("..", 0)
+		if err != syscall.EINVAL {
+			t.Errorf("%v: UnlinkAt() should have failed, got: %v, expected: syscall.EINVAL", s, err)
+		}
+
+		err = s.file.Link(s.file, "..")
+		if err != syscall.EINVAL {
+			t.Errorf("%v: Link() should have failed, got: %v, expected: syscall.EINVAL", s, err)
+		}
+	})
+}
+
+// TestIsNameValid checks isNameValid against names that must be accepted and
+// names that must be rejected.
+func TestIsNameValid(t *testing.T) {
+	cases := []struct {
+		name string
+		want bool
+	}{
+		// Accepted: ordinary names, including ones that merely begin with dots.
+		{"name", true},
+		{"123", true},
+		{"!@#$%^&*()", true},
+		{".name", true},
+		{"..name", true},
+		{"...", true},
+		// Rejected: "." and "..", and anything containing a slash.
+		{".", false},
+		{"..", false},
+		{"name/name", false},
+		{"/name", false},
+		{"name/", false},
+	}
+	for _, tc := range cases {
+		got := isNameValid(tc.name)
+		switch {
+		case tc.want && !got:
+			t.Errorf("isNameValid(%s) failed, got: %v, expected: true", tc.name, got)
+		case !tc.want && got:
+			t.Errorf("isNameValid(%s) failed, got: %v, expected: false", tc.name, got)
+		}
+	}
+}
+
+func TestWalkNotFound(t *testing.T) {
+	runCustom(t, []fileType{directory}, allConfs, func(t *testing.T, s state) {
+		if _, _, err := s.file.Walk([]string{"nobody-here"}); err != syscall.ENOENT {
+			t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: syscall.ENOENT", s, "nobody-here", err)
+		}
+	})
+}
+
+// TestWalkDup checks that walking with an empty name list returns a usable
+// file, for every file type and configuration.
+func TestWalkDup(t *testing.T) {
+	runAll(t, func(t *testing.T, s state) {
+		_, dup, err := s.file.Walk([]string{})
+		if err != nil {
+			t.Fatalf("%v: Walk(nil) failed, err: %v", s, err)
+		}
+		// The file returned by the walk must be released by the test.
+		defer dup.Close()
+
+		// Check that 'dup' is usable.
+		if _, _, _, err := dup.GetAttr(p9.AttrMask{}); err != nil {
+			t.Errorf("%v: GetAttr() failed, err: %v", s, err)
+		}
+	})
+}
+
+// TestReaddir checks that Readdir reports exactly the entries created in a
+// directory, with the right type for each.
+//
+// A directory, a symlink and a regular file are created in the directory
+// under test, which is then opened read-only and listed. Each entry's type
+// must match both the expected type for its name and the type derived from
+// GetAttr on the entry itself.
+func TestReaddir(t *testing.T) {
+	runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+		name := "dir"
+		if _, err := s.file.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
+			t.Fatalf("%v: MkDir(%s) failed, err: %v", s, name, err)
+		}
+		name = "symlink"
+		if _, err := s.file.Symlink("/some/target", name, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
+			t.Fatalf("%v: Symlink(%q) failed, err: %v", s, name, err)
+		}
+		name = "file"
+		_, f, _, _, err := s.file.Create(name, p9.ReadWrite, 0555, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
+		if err != nil {
+			t.Fatalf("%v: createFile(root, %q) failed, err: %v", s, name, err)
+		}
+		f.Close()
+
+		if _, _, _, err := s.file.Open(p9.ReadOnly); err != nil {
+			t.Fatalf("%v: Open(ReadOnly) failed, err: %v", s, err)
+		}
+
+		dirents, err := s.file.Readdir(0, 10)
+		if err != nil {
+			t.Fatalf("%v: Readdir(0, 10) failed, err: %v", s, err)
+		}
+		if len(dirents) != 3 {
+			t.Fatalf("%v: Readdir(0, 10) wrong number of items, got: %v, expected: 3", s, len(dirents))
+		}
+		var dir, symlink, file bool
+		for _, d := range dirents {
+			switch d.Name {
+			case "dir":
+				if d.Type != p9.TypeDir {
+					t.Errorf("%v: dirent.Type got: %v, expected: %v", s, d.Type, p9.TypeDir)
+				}
+				dir = true
+			case "symlink":
+				if d.Type != p9.TypeSymlink {
+					t.Errorf("%v: dirent.Type got: %v, expected: %v", s, d.Type, p9.TypeSymlink)
+				}
+				symlink = true
+			case "file":
+				if d.Type != p9.TypeRegular {
+					t.Errorf("%v: dirent.Type got: %v, expected: %v", s, d.Type, p9.TypeRegular)
+				}
+				file = true
+			default:
+				t.Errorf("%v: dirent.Name got: %v", s, d.Name)
+			}
+
+			_, entry, err := s.file.Walk([]string{d.Name})
+			if err != nil {
+				t.Fatalf("%v: Walk({%s}) failed, err: %v", s, d.Name, err)
+			}
+			_, _, a, err := entry.GetAttr(p9.AttrMask{})
+			// Release the walked entry before looking at the result, so it is
+			// closed on the failure path as well.
+			entry.Close()
+			if err != nil {
+				t.Fatalf("%v: GetAttr() failed, err: %v", s, err)
+			}
+			if d.Type != a.Mode.QIDType() {
+				t.Errorf("%v: dirent.Type different than GetAttr().Mode.QIDType(), got: %v, expected: %v", s, d.Type, a.Mode.QIDType())
+			}
+		}
+		if !dir || !symlink || !file {
+			t.Errorf("%v: Readdir(0, 10) wrong files returned, dir: %v, symlink: %v, file: %v", s, dir, symlink, file)
+		}
+	})
+}
diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go
new file mode 100644
index 000000000..e676809ac
--- /dev/null
+++ b/runsc/fsgofer/fsgofer_unsafe.go
@@ -0,0 +1,58 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsgofer
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+)
+
+// statAt stats 'name' relative to the directory 'dirFd' by issuing the
+// newfstatat syscall directly with the AT_SYMLINK_NOFOLLOW flag.
+//
+// On success the filled-in Stat_t is returned. On failure the zero Stat_t is
+// returned together with the errno of the failed call.
+func statAt(dirFd int, name string) (syscall.Stat_t, error) {
+	nameBytes, err := syscall.BytePtrFromString(name)
+	if err != nil {
+		return syscall.Stat_t{}, extractErrno(err)
+	}
+	// Hold the arguments as unsafe.Pointer and convert them to uintptr only
+	// inside the syscall's argument list. A uintptr stored in a variable does
+	// not keep the referenced object alive, and the unsafe package only
+	// guarantees the conversion is valid when it appears in the call
+	// expression itself.
+	namePtr := unsafe.Pointer(nameBytes)
+
+	var stat syscall.Stat_t
+	statPtr := unsafe.Pointer(&stat)
+
+	if _, _, errno := syscall.Syscall6(syscall.SYS_NEWFSTATAT, uintptr(dirFd), uintptr(namePtr), uintptr(statPtr), linux.AT_SYMLINK_NOFOLLOW, 0, 0); errno != 0 {
+		return syscall.Stat_t{}, errno
+	}
+	return stat, nil
+}
+
+// utimensat issues the utimensat syscall directly with the given times and
+// flags.
+//
+// When name is empty a nil pathname is passed so that the call operates on
+// 'dirFd' itself; otherwise name is passed along with 'dirFd'. A failure is
+// returned as the errno of the failed call.
+func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) error {
+	// utimensat(2) doesn't accept empty name, instead name must be nil to make it
+	// operate directly on 'dirFd' unlike other *at syscalls.
+	var namePtr unsafe.Pointer
+	if name != "" {
+		nameBytes, err := syscall.BytePtrFromString(name)
+		if err != nil {
+			return extractErrno(err)
+		}
+		namePtr = unsafe.Pointer(nameBytes)
+	}
+
+	// Pointers stay unsafe.Pointer until the call expression below: a uintptr
+	// held in a variable does not keep the referenced object alive, and the
+	// unsafe package only guarantees the conversion is valid when it appears
+	// in the syscall's argument list.
+	timesPtr := unsafe.Pointer(&times[0])
+
+	if _, _, errno := syscall.Syscall6(syscall.SYS_UTIMENSAT, uintptr(dirFd), uintptr(namePtr), uintptr(timesPtr), uintptr(flags), 0, 0); errno != 0 {
+		return errno
+	}
+	return nil
+}
diff --git a/runsc/main.go b/runsc/main.go
new file mode 100644
index 000000000..cf4b99d3f
--- /dev/null
+++ b/runsc/main.go
@@ -0,0 +1,199 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Binary runsc is an implementation of the Open Container Initiative Runtime
+// that runs applications inside a sandbox.
+package main
+
+import (
+	"fmt"
+	"io"
+	"os"
+	"path/filepath"
+	"strings"
+	"syscall"
+	"time"
+
+	"context"
+	"flag"
+
+	"github.com/google/subcommands"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/runsc/boot"
+	"gvisor.googlesource.com/gvisor/runsc/cmd"
+)
+
+// Command-line flags. They are read in main() to build the boot.Config and
+// to configure logging.
+var (
+	// Although these flags are not part of the OCI spec, they are used by
+	// Docker, and thus should not be changed.
+	rootDir     = flag.String("root", "", "root directory for storage of container state")
+	logFilename = flag.String("log", "", "file path where internal debug information is written, default is stderr")
+	logFormat   = flag.String("log-format", "text", "log format: text (default) or json")
+	debug       = flag.Bool("debug", false, "enable debug logging")
+
+	// These flags are unique to runsc, and are used to configure parts of the
+	// system that are not covered by the runtime spec.
+
+	// Debugging flags.
+	debugLogDir = flag.String("debug-log-dir", "", "additional location for logs. It creates individual log files per command")
+	logPackets  = flag.Bool("log-packets", false, "enable network packet logging")
+
+	// Debugging flags: strace related
+	strace         = flag.Bool("strace", false, "enable strace")
+	straceSyscalls = flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.")
+	straceLogSize  = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs")
+
+	// Flags that control sandbox runtime behavior.
+	platform   = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm")
+	network    = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
+	fileAccess = flag.String("file-access", "proxy", "specifies which filesystem to use: proxy (default), direct. Using a proxy is more secure because it disallows the sandbox from opening files directly in the host.")
+	overlay    = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
+)
+
+// main registers the runsc subcommands, parses the command-line flags into a
+// boot.Config, sets up logging, runs the requested subcommand and exits with
+// a status derived from its result.
+//
+// Exit status: when the subcommand succeeds, the wait status it filled in is
+// used (128 + signal number if it reports a signal, its exit status
+// otherwise). When the subcommand itself fails, the exit status is 128.
+func main() {
+	// Help and flags commands are generated automatically.
+	subcommands.Register(subcommands.HelpCommand(), "")
+	subcommands.Register(subcommands.FlagsCommand(), "")
+
+	// Register user-facing runsc commands.
+	subcommands.Register(new(cmd.Create), "")
+	subcommands.Register(new(cmd.Delete), "")
+	subcommands.Register(new(cmd.Events), "")
+	subcommands.Register(new(cmd.Exec), "")
+	subcommands.Register(new(cmd.Kill), "")
+	subcommands.Register(new(cmd.List), "")
+	subcommands.Register(new(cmd.PS), "")
+	subcommands.Register(new(cmd.Run), "")
+	subcommands.Register(new(cmd.Start), "")
+	subcommands.Register(new(cmd.State), "")
+
+	// Register internal commands with the internal group name. This causes
+	// them to be sorted below the user-facing commands with empty group.
+	// The string below will be printed above the commands.
+	// The gofer command is registered here only, not in the user-facing
+	// group as well.
+	const internalGroup = "internal use only"
+	subcommands.Register(new(cmd.Boot), internalGroup)
+	subcommands.Register(new(cmd.Gofer), internalGroup)
+
+	// All subcommands must be registered before flag parsing.
+	flag.Parse()
+
+	platformType, err := boot.MakePlatformType(*platform)
+	if err != nil {
+		cmd.Fatalf("%v", err)
+	}
+
+	fsAccess, err := boot.MakeFileAccessType(*fileAccess)
+	if err != nil {
+		cmd.Fatalf("%v", err)
+	}
+
+	netType, err := boot.MakeNetworkType(*network)
+	if err != nil {
+		cmd.Fatalf("%v", err)
+	}
+
+	// Create a new Config from the flags.
+	conf := &boot.Config{
+		RootDir:       *rootDir,
+		FileAccess:    fsAccess,
+		Overlay:       *overlay,
+		Network:       netType,
+		LogPackets:    *logPackets,
+		Platform:      platformType,
+		Strace:        *strace,
+		StraceLogSize: *straceLogSize,
+	}
+	if len(*straceSyscalls) != 0 {
+		conf.StraceSyscalls = strings.Split(*straceSyscalls, ",")
+	}
+
+	// Set up logging.
+	if *debug {
+		log.SetLevel(log.Debug)
+	}
+
+	// Logs go to stderr unless a log file was requested.
+	var logFile io.Writer = os.Stderr
+	if *logFilename != "" {
+		f, err := os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
+		if err != nil {
+			cmd.Fatalf("error opening log file %q: %v", *logFilename, err)
+		}
+		logFile = f
+	}
+
+	var e log.Emitter
+	switch *logFormat {
+	case "text":
+		e = log.GoogleEmitter{&log.Writer{Next: logFile}}
+	case "json":
+		e = log.JSONEmitter{log.Writer{Next: logFile}}
+	default:
+		cmd.Fatalf("invalid log format %q, must be 'json' or 'text'", *logFormat)
+	}
+
+	if *debugLogDir != "" {
+		if err := os.MkdirAll(*debugLogDir, 0775); err != nil {
+			cmd.Fatalf("error creating dir %q: %v", *debugLogDir, err)
+		}
+
+		// Format: <debug-log-dir>/runsc.log.<yyyymmdd-hhmmss.uuuuuu>.<command>
+		scmd := flag.CommandLine.Arg(0)
+		filename := fmt.Sprintf("runsc.log.%s.%s", time.Now().Format("20060102-150405.000000"), scmd)
+		path := filepath.Join(*debugLogDir, filename)
+		f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664)
+		if err != nil {
+			// Report the full path that was opened, not just the file name.
+			cmd.Fatalf("error opening log file %q: %v", path, err)
+		}
+		e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}}
+	}
+
+	log.SetTarget(e)
+
+	log.Infof("***************************")
+	log.Infof("Args: %s", os.Args)
+	log.Infof("PID: %d", os.Getpid())
+	log.Infof("UID: %d, GID: %d", os.Getuid(), os.Getgid())
+	log.Infof("Configuration:")
+	log.Infof("\t\tRootDir: %s", conf.RootDir)
+	log.Infof("\t\tPlatform: %v", conf.Platform)
+	log.Infof("\t\tFileAccess: %v, overlay: %t", conf.FileAccess, conf.Overlay)
+	log.Infof("\t\tNetwork: %v, logging: %t", conf.Network, conf.LogPackets)
+	log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls)
+	log.Infof("***************************")
+
+	// Call the subcommand and pass in the configuration.
+	var ws syscall.WaitStatus
+	subcmdCode := subcommands.Execute(context.Background(), conf, &ws)
+	if subcmdCode == subcommands.ExitSuccess {
+		log.Infof("Exiting with status: %v", ws)
+		if ws.Signaled() {
+			// No good way to return it, emulate what the shell does. Maybe raise
+			// signal to self?
+			os.Exit(128 + int(ws.Signal()))
+		}
+		os.Exit(ws.ExitStatus())
+	}
+	// Return an error that is unlikely to be used by the application.
+	log.Warningf("Failure to execute command, err: %v", subcmdCode)
+	os.Exit(128)
+}
+
+// init picks the default for the --root flag: "runsc" under XDG_RUNTIME_DIR
+// when that variable is set, /var/run/runsc otherwise.
+func init() {
+	// Set default root dir to something (hopefully) user-writeable.
+	defaultRoot := "/var/run/runsc"
+	if xdg := os.Getenv("XDG_RUNTIME_DIR"); xdg != "" {
+		defaultRoot = filepath.Join(xdg, "runsc")
+	}
+	*rootDir = defaultRoot
+}
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
new file mode 100644
index 000000000..bdd95903e
--- /dev/null
+++ b/runsc/sandbox/BUILD
@@ -0,0 +1,53 @@
+package(licenses = ["notice"])  # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+# Go library target for the sandbox package; visible to packages under
+# //runsc only.
+go_library(
+    name = "sandbox",
+    srcs = [
+        "console.go",
+        "hook.go",
+        "namespace.go",
+        "network.go",
+        "sandbox.go",
+        "status.go",
+    ],
+    importpath = "gvisor.googlesource.com/gvisor/runsc/sandbox",
+    visibility = [
+        "//runsc:__subpackages__",
+    ],
+    deps = [
+        "//pkg/control/client",
+        "//pkg/control/server",
+        "//pkg/log",
+        "//pkg/sentry/control",
+        "//pkg/urpc",
+        "//runsc/boot",
+        "//runsc/specutils",
+        "@com_github_kr_pty//:go_default_library",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@com_github_vishvananda_netlink//:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+# Test target built from sandbox_test.go. It lists the sandbox library above
+# among its dependencies.
+go_test(
+    name = "sandbox_test",
+    size = "small",
+    srcs = ["sandbox_test.go"],
+    pure = "on",
+    rundir = ".",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/log",
+        "//pkg/sentry/control",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/unet",
+        "//runsc/boot",
+        "//runsc/cmd",
+        "//runsc/sandbox",
+        "@com_github_google_subcommands//:go_default_library",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/runsc/sandbox/console.go b/runsc/sandbox/console.go
new file mode 100644
index 000000000..3f133e12a
--- /dev/null
+++ b/runsc/sandbox/console.go
@@ -0,0 +1,60 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sandbox
+
+import (
+	"fmt"
+	"net"
+	"os"
+
+	"github.com/kr/pty"
+	"golang.org/x/sys/unix"
+)
+
+// setupConsole creates pty master/slave pair, sends the master FD over the
+// given socket, and returns the slave.
+//
+// The master and both handles on the socket are closed before returning. On
+// success the caller owns the returned slave and is responsible for closing
+// it; on failure the slave is closed here and a nil file is returned.
+func setupConsole(socketPath string) (*os.File, error) {
+	// Create a new pty master and slave.
+	ptyMaster, ptySlave, err := pty.Open()
+	if err != nil {
+		return nil, fmt.Errorf("error opening pty: %v", err)
+	}
+	defer ptyMaster.Close()
+
+	// Get a connection to the socket path.
+	conn, err := net.Dial("unix", socketPath)
+	if err != nil {
+		ptySlave.Close()
+		return nil, fmt.Errorf("error dial socket %q: %v", socketPath, err)
+	}
+	// The connection is only needed to hand over the master FD.
+	defer conn.Close()
+
+	uc, ok := conn.(*net.UnixConn)
+	if !ok {
+		ptySlave.Close()
+		return nil, fmt.Errorf("connection is not a UnixConn: %T", conn)
+	}
+	socket, err := uc.File()
+	if err != nil {
+		ptySlave.Close()
+		return nil, fmt.Errorf("error getting file for unix socket %v: %v", uc, err)
+	}
+	// File() returns a duplicate that is not released by closing the
+	// connection, so it needs its own Close.
+	defer socket.Close()
+
+	// Send the master FD over the connection.
+	msg := unix.UnixRights(int(ptyMaster.Fd()))
+	if err := unix.Sendmsg(int(socket.Fd()), []byte("pty-master"), msg, nil, 0); err != nil {
+		ptySlave.Close()
+		return nil, fmt.Errorf("error sending console over unix socket %q: %v", socketPath, err)
+	}
+	return ptySlave, nil
+}
diff --git a/runsc/sandbox/hook.go b/runsc/sandbox/hook.go
new file mode 100644
index 000000000..40b064cdc
--- /dev/null
+++ b/runsc/sandbox/hook.go
@@ -0,0 +1,111 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sandbox
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"time"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+)
+
+// This file implements hooks as defined in OCI spec:
+// https://github.com/opencontainers/runtime-spec/blob/master/config.md#toc22
+//
+// "hooks":{
+// 		"prestart":[{
+// 			"path":"/usr/bin/dockerd",
+// 			"args":[
+// 				"libnetwork-setkey", "arg2"
+// 			]
+// 		}]
+// },
+
+// executeHooksBestEffort runs every hook in order. A hook that fails is
+// reported with a warning and does not stop the remaining hooks from running.
+func executeHooksBestEffort(hooks []specs.Hook, s specs.State) {
+	for i := range hooks {
+		hook := hooks[i]
+		err := executeHook(hook, s)
+		if err == nil {
+			continue
+		}
+		log.Warningf("Failure to execute hook %+v, err: %v", hook, err)
+	}
+}
+
+// executeHooks runs the hooks in order and stops at the first failure,
+// returning that hook's error. It returns nil when every hook succeeds.
+func executeHooks(hooks []specs.Hook, s specs.State) error {
+	for i := range hooks {
+		err := executeHook(hooks[i], s)
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// executeHook runs a single hook, writing the JSON-encoded state s to its
+// stdin and capturing its stdout and stderr.
+//
+// The hook path must be non-empty and absolute. If h.Timeout is set, the hook
+// is killed once that many seconds have elapsed and an error is returned. An
+// error is also returned when the state cannot be marshalled, the hook cannot
+// be started, or the hook exits unsuccessfully. Timeout and exit failures
+// include the captured output in the error text.
+func executeHook(h specs.Hook, s specs.State) error {
+	log.Debugf("Executing hook %+v, state: %+v", h, s)
+
+	if strings.TrimSpace(h.Path) == "" {
+		return fmt.Errorf("empty path for hook")
+	}
+	if !filepath.IsAbs(h.Path) {
+		return fmt.Errorf("path for hook is not absolute: %q", h.Path)
+	}
+
+	b, err := json.Marshal(s)
+	if err != nil {
+		return err
+	}
+	var stdout, stderr bytes.Buffer
+	cmd := exec.Cmd{
+		Path:   h.Path,
+		Args:   h.Args,
+		Env:    h.Env,
+		Stdin:  bytes.NewReader(b),
+		Stdout: &stdout,
+		Stderr: &stderr,
+	}
+	if err := cmd.Start(); err != nil {
+		return err
+	}
+
+	// Buffered so the waiting goroutine can always deliver its result and
+	// exit, even if nobody receives it.
+	c := make(chan error, 1)
+	go func() {
+		c <- cmd.Wait()
+	}()
+
+	// A nil channel never becomes ready, so without a timeout the select
+	// below simply blocks until the hook exits.
+	var timer <-chan time.Time
+	if h.Timeout != nil {
+		timer = time.After(time.Duration(*h.Timeout) * time.Second)
+	}
+	select {
+	case err := <-c:
+		if err != nil {
+			return fmt.Errorf("failure executing hook %q, err: %v\nstdout: %s\nstderr: %s", h.Path, err, stdout.String(), stderr.String())
+		}
+	case <-timer:
+		cmd.Process.Kill()
+		// The goroutine above is already inside cmd.Wait. Calling Wait a
+		// second time would race with it, and the output buffers are only
+		// safe to read once that Wait has returned, so block on its result
+		// instead. This waits until the killed hook has been reaped and its
+		// output pipes have been drained.
+		<-c
+		return fmt.Errorf("timeout executing hook %q\nstdout: %s\nstderr: %s", h.Path, stdout.String(), stderr.String())
+	}
+
+	log.Debugf("Execute hook %q success!", h.Path)
+	return nil
+}
diff --git a/runsc/sandbox/namespace.go b/runsc/sandbox/namespace.go
new file mode 100644
index 000000000..1d3bcfbb5
--- /dev/null
+++ b/runsc/sandbox/namespace.go
@@ -0,0 +1,204 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sandbox
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"syscall"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+)
+
+// nsCloneFlag returns the clone flag that can be used to set a namespace of
+// the given type.
+func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr {
+	switch nst {
+	case specs.IPCNamespace:
+		return syscall.CLONE_NEWIPC
+	case specs.MountNamespace:
+		return syscall.CLONE_NEWNS
+	case specs.NetworkNamespace:
+		return syscall.CLONE_NEWNET
+	case specs.PIDNamespace:
+		return syscall.CLONE_NEWPID
+	case specs.UTSNamespace:
+		return syscall.CLONE_NEWUTS
+	case specs.UserNamespace:
+		return syscall.CLONE_NEWUSER
+	case specs.CgroupNamespace:
+		panic("cgroup namespace has no associated clone flag")
+	default:
+		panic(fmt.Sprintf("unknown namespace %v", nst))
+	}
+}
+
+// nsPath returns the path of the namespace for the current process and the
+// given namespace.
+func nsPath(nst specs.LinuxNamespaceType) string {
+	base := "/proc/self/ns"
+	switch nst {
+	case specs.CgroupNamespace:
+		return filepath.Join(base, "cgroup")
+	case specs.IPCNamespace:
+		return filepath.Join(base, "ipc")
+	case specs.MountNamespace:
+		return filepath.Join(base, "mnt")
+	case specs.NetworkNamespace:
+		return filepath.Join(base, "net")
+	case specs.PIDNamespace:
+		return filepath.Join(base, "pid")
+	case specs.UserNamespace:
+		return filepath.Join(base, "user")
+	case specs.UTSNamespace:
+		return filepath.Join(base, "uts")
+	default:
+		panic(fmt.Sprintf("unknown namespace %v", nst))
+	}
+}
+
+// getNS looks up the first namespace of the given type in the spec. It
+// returns that namespace and true when one is found, or the zero
+// LinuxNamespace and false when the spec has no Linux section or no namespace
+// of that type.
+func getNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) {
+	if s.Linux != nil {
+		for i := range s.Linux.Namespaces {
+			if candidate := s.Linux.Namespaces[i]; candidate.Type == nst {
+				return candidate, true
+			}
+		}
+	}
+	return specs.LinuxNamespace{}, false
+}
+
+// filterNS collects the namespaces from the spec whose types appear in
+// `filter`. The result follows the order of `filter` and holds at most one
+// namespace per requested type (the first match found by getNS). It is nil
+// when the spec has no Linux section or nothing matches.
+func filterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNamespace {
+	var matched []specs.LinuxNamespace
+	if s.Linux == nil {
+		return matched
+	}
+	for i := range filter {
+		ns, found := getNS(filter[i], s)
+		if !found {
+			continue
+		}
+		matched = append(matched, ns)
+	}
+	return matched
+}
+
+// setNS issues the setns system call with the namespace file descriptor fd
+// and the namespace type flag nsType, returning the errno on failure.  It
+// must be called with OSThreadLocked.
+func setNS(fd, nsType uintptr) error {
+	_, _, errno := syscall.RawSyscall(unix.SYS_SETNS, fd, nsType, 0)
+	if errno == 0 {
+		return nil
+	}
+	return errno
+}
+
+// applyNS moves the current thread into the namespace at ns.Path and returns
+// a function that moves it back into the namespace it was in before.
+//
+// The returned function closes the saved handle to the previous namespace and
+// panics if the previous namespace cannot be restored.
+//
+// Preconditions: Must be called with os thread locked.
+func applyNS(ns specs.LinuxNamespace) (func(), error) {
+	log.Infof("applying namespace %v at path %q", ns.Type, ns.Path)
+	target, err := os.Open(ns.Path)
+	if err != nil {
+		return nil, fmt.Errorf("error opening %q: %v", ns.Path, err)
+	}
+	// The target handle is only needed for the setns call below.
+	defer target.Close()
+
+	// Keep a handle on the current namespace of this type so that it can be
+	// restored later.
+	selfPath := nsPath(ns.Type)
+	previous, err := os.Open(selfPath)
+	if err != nil {
+		return nil, fmt.Errorf("error opening %q: %v", selfPath, err)
+	}
+
+	// Switch to the requested namespace.
+	cloneFlag := nsCloneFlag(ns.Type)
+	if err := setNS(target.Fd(), cloneFlag); err != nil {
+		previous.Close()
+		return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err)
+	}
+
+	restore := func() {
+		log.Infof("restoring namespace %v", ns.Type)
+		defer previous.Close()
+		if err := setNS(previous.Fd(), cloneFlag); err != nil {
+			panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err))
+		}
+	}
+	return restore, nil
+}
+
+// startInNS starts cmd inside the given namespaces. Namespaces with a path
+// are joined by the current thread for the duration of cmd.Start and restored
+// afterwards; namespaces without a path are requested as new namespaces via
+// the command's clone flags.
+func startInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error {
+	// Joining namespaces affects the calling thread, so pin the goroutine to
+	// its thread to keep Go from switching threads out from under us.
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	if cmd.SysProcAttr == nil {
+		cmd.SysProcAttr = &syscall.SysProcAttr{}
+	}
+
+	for i := range nss {
+		ns := nss[i]
+		if ns.Path != "" {
+			// Join the existing namespace. The restore is deliberately
+			// deferred to function exit so that it runs after cmd.Start.
+			restore, err := applyNS(ns)
+			if err != nil {
+				return err
+			}
+			defer restore()
+			continue
+		}
+		// No path.  Ask for a brand new namespace of this type.
+		cmd.SysProcAttr.Cloneflags |= nsCloneFlag(ns.Type)
+	}
+
+	return cmd.Start()
+}
+
+// setUIDGIDMappings appends the uid and gid mappings from the spec to the
+// command's SysProcAttr, allocating the SysProcAttr if the command has none.
+// It does nothing when the spec has no Linux section.
+func setUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) {
+	if s.Linux == nil {
+		return
+	}
+	if cmd.SysProcAttr == nil {
+		cmd.SysProcAttr = &syscall.SysProcAttr{}
+	}
+	attr := cmd.SysProcAttr
+
+	for i := range s.Linux.UIDMappings {
+		m := s.Linux.UIDMappings[i]
+		log.Infof("Mapping host uid %d to container uid %d (size=%d)", m.HostID, m.ContainerID, m.Size)
+		entry := syscall.SysProcIDMap{
+			ContainerID: int(m.ContainerID),
+			HostID:      int(m.HostID),
+			Size:        int(m.Size),
+		}
+		attr.UidMappings = append(attr.UidMappings, entry)
+	}
+
+	for i := range s.Linux.GIDMappings {
+		m := s.Linux.GIDMappings[i]
+		log.Infof("Mapping host gid %d to container gid %d (size=%d)", m.HostID, m.ContainerID, m.Size)
+		entry := syscall.SysProcIDMap{
+			ContainerID: int(m.ContainerID),
+			HostID:      int(m.HostID),
+			Size:        int(m.Size),
+		}
+		attr.GidMappings = append(attr.GidMappings, entry)
+	}
+}
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
new file mode 100644
index 000000000..1b6a1d9a6
--- /dev/null
+++ b/runsc/sandbox/network.go
@@ -0,0 +1,348 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sandbox
+
+import (
+	"fmt"
+	"net"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strconv"
+	"strings"
+	"syscall"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/vishvananda/netlink"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/urpc"
+	"gvisor.googlesource.com/gvisor/runsc/boot"
+)
+
+// setupNetwork configures the network stack to mimic the local network
+// configuration. Docker uses network namespaces with vnets to configure the
+// network for the container. The untrusted app expects to see the same network
+// inside the sandbox. Routing and port mapping is handled directly by docker
+// with most of network information not even available to the runtime.
+//
+// Netstack inside the sandbox speaks directly to the device using a raw socket.
+// All IP addresses assigned to the NIC, are removed and passed on to netstack's
+// device.
+//
+// The behavior depends on 'conf.Network':
+//   - NetworkNone: skips local configuration and creates a loopback
+//     interface only.
+//   - NetworkSandbox: copies interfaces and routes from the net namespace of
+//     the process with the given pid.
+//   - NetworkHost: nothing is configured here.
+//
+// Any other value is an error. Note that 'conf.Network' is overwritten with
+// NetworkNone when the spec is detected to be a kubernetes "pause" sandbox
+// (see the HACK below).
+//
+// Run the following container to test it:
+//  docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4
+func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Config) error {
+	log.Infof("Setting up network")
+
+	// HACK!
+	//
+	// When kubernetes starts a pod, it first creates a sandbox with an
+	// application that just pauses forever.  Later, when a container is
+	// added to the pod, kubernetes will create another sandbox with a
+	// config that corresponds to the containerized application, and add it
+	// to the same namespaces as the pause sandbox.
+	//
+	// Running a second sandbox currently breaks because the two sandboxes
+	// have the same network namespace and configuration, and try to create
+	// a tap device on the same host device which fails.
+	//
+	// Runsc will eventually need to detect that this container is meant to
+	// be run in the same sandbox as the pausing application, and somehow
+	// make that happen.
+	//
+	// For now the following HACK disables networking for the "pause"
+	// sandbox, allowing the second sandbox to start up successfully.
+	//
+	// Cri-o helpfully adds the "ContainerType" annotation that we can use
+	// to detect whether we are a pod or container.  Cri-containerd will
+	// support this eventually, but does not currently
+	// (https://github.com/kubernetes-incubator/cri-containerd/issues/512).
+	//
+	// Thus, to support cri-containerd, we check if the exec args is
+	// "/pause", which is pretty gross.
+	//
+	// TODO: Remove this once multiple containers per sandbox
+	// is properly supported.
+	//
+	// Guard the Args lookup so that a spec without a process or without
+	// arguments is treated as "not the pause container" instead of panicking.
+	isPause := spec.Process != nil && len(spec.Process.Args) > 0 && spec.Process.Args[0] == "/pause"
+	if spec.Annotations["io.kubernetes.cri-o.ContainerType"] == "sandbox" || isPause {
+		log.Warningf("HACK: Disabling network")
+		conf.Network = boot.NetworkNone
+	}
+
+	switch conf.Network {
+	case boot.NetworkNone:
+		log.Infof("Network is disabled, create loopback interface only")
+		if err := createDefaultLoopbackInterface(conn); err != nil {
+			return fmt.Errorf("error creating default loopback interface: %v", err)
+		}
+	case boot.NetworkSandbox:
+		// Build the path to the net namespace of the sandbox process.
+		// This is what we will copy.
+		nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
+		if err := createInterfacesAndRoutesFromNS(conn, nsPath); err != nil {
+			return fmt.Errorf("error creating interfaces from net namespace %q: %v", nsPath, err)
+		}
+	case boot.NetworkHost:
+		// Nothing to do here.
+	default:
+		return fmt.Errorf("Invalid network type: %d", conf.Network)
+	}
+	return nil
+}
+
+// createDefaultLoopbackInterface asks the sandbox to create a single loopback
+// link named "lo" carrying 127.0.0.1 and ::1, with a 127.0.0.0/8 route and a
+// host route for ::1.
+func createDefaultLoopbackInterface(conn *urpc.Client) error {
+	v4Route := boot.Route{
+		Destination: net.IP("\x7f\x00\x00\x00"),
+		Mask:        net.IPMask("\xff\x00\x00\x00"),
+	}
+	v6Route := boot.Route{
+		Destination: net.IPv6loopback,
+		Mask:        net.IPMask(strings.Repeat("\xff", 16)),
+	}
+	lo := boot.LoopbackLink{
+		Name:      "lo",
+		Addresses: []net.IP{net.IP("\x7f\x00\x00\x01"), net.IPv6loopback},
+		Routes:    []boot.Route{v4Route, v6Route},
+	}
+
+	args := boot.CreateLinksAndRoutesArgs{
+		LoopbackLinks: []boot.LoopbackLink{lo},
+	}
+	err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil)
+	if err != nil {
+		return fmt.Errorf("error creating loopback link and routes: %v", err)
+	}
+	return nil
+}
+
+// joinNetNS locks the calling goroutine to its OS thread and moves that
+// thread into the network namespace at nsPath. The returned function restores
+// the previous namespace and unlocks the thread. If joining fails, the
+// thread is unlocked again before the error is returned.
+func joinNetNS(nsPath string) (func(), error) {
+	target := specs.LinuxNamespace{
+		Type: specs.NetworkNamespace,
+		Path: nsPath,
+	}
+
+	runtime.LockOSThread()
+	undo, err := applyNS(target)
+	if err != nil {
+		runtime.UnlockOSThread()
+		return nil, fmt.Errorf("error joining net namespace %q: %v", nsPath, err)
+	}
+
+	restore := func() {
+		undo()
+		runtime.UnlockOSThread()
+	}
+	return restore, nil
+}
+
+// isRootNS reports whether the given interfaces look like they belong to the
+// root net namespace. The only signal used is the presence of an interface
+// named "docker0".
+//
+// TODO: Find a better way to detect root network.
+func isRootNS(ifaces []net.Interface) bool {
+	for i := range ifaces {
+		if ifaces[i].Name == "docker0" {
+			return true
+		}
+	}
+	return false
+}
+
+// createInterfacesAndRoutesFromNS scrapes the interfaces and routes from the
+// net namespace with the given path, creates them in the sandbox, and removes
+// the addresses from the host devices.
+//
+// Interfaces that are down are skipped. Each loopback interface is turned
+// into loopback links. Every other interface gets an AF_PACKET raw socket
+// bound to it; its routes and addresses are recorded and the addresses are
+// deleted from the host device. Everything collected is then sent to the
+// sandbox in a single NetworkCreateLinksAndRoutes call, with the raw sockets
+// attached as file payload. At most one default route is accepted across all
+// interfaces.
+//
+// Addresses that have already been removed from a host device are not put
+// back if a later step fails.
+func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string) error {
+	// Join the network namespace that we will be copying.
+	restore, err := joinNetNS(nsPath)
+	if err != nil {
+		return err
+	}
+	defer restore()
+
+	// Get all interfaces in the namespace.
+	ifaces, err := net.Interfaces()
+	if err != nil {
+		return fmt.Errorf("error querying interfaces: %v", err)
+	}
+
+	if isRootNS(ifaces) {
+		return fmt.Errorf("cannot run in with network enabled in root network namespace")
+	}
+
+	// Collect addresses and routes from the interfaces.
+	var args boot.CreateLinksAndRoutesArgs
+	for _, iface := range ifaces {
+		if iface.Flags&net.FlagUp == 0 {
+			log.Infof("Skipping down interface: %+v", iface)
+			continue
+		}
+
+		ifaddrs, err := iface.Addrs()
+		if err != nil {
+			return fmt.Errorf("error fetching interface addresses for %q: %v", iface.Name, err)
+		}
+
+		// We build our own loopback devices.
+		if iface.Flags&net.FlagLoopback != 0 {
+			links, err := loopbackLinks(iface, ifaddrs)
+			if err != nil {
+				return fmt.Errorf("error getting loopback routes and links for iface %q: %v", iface.Name, err)
+			}
+			args.LoopbackLinks = append(args.LoopbackLinks, links...)
+			continue
+		}
+
+		// Get the link for the interface.
+		ifaceLink, err := netlink.LinkByName(iface.Name)
+		if err != nil {
+			return fmt.Errorf("error getting link for interface %q: %v", iface.Name, err)
+		}
+
+		// Create the socket.
+		const protocol = 0x0300 // htons(ETH_P_ALL)
+		fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
+		if err != nil {
+			return fmt.Errorf("unable to create raw socket: %v", err)
+		}
+		deviceFile := os.NewFile(uintptr(fd), "raw-device-fd")
+		// Deliberately deferred to function exit rather than loop iteration:
+		// the descriptor must stay open until conn.Call below has sent it,
+		// and must also be released on every early error return.
+		defer deviceFile.Close()
+
+		// Bind to the appropriate device.
+		ll := syscall.SockaddrLinklayer{
+			Protocol: protocol,
+			Ifindex:  ifaceLink.Attrs().Index,
+			Hatype:   0, // No ARP type.
+			Pkttype:  syscall.PACKET_OTHERHOST,
+		}
+		if err := syscall.Bind(fd, &ll); err != nil {
+			return fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
+		}
+
+		// Scrape the routes before removing the address, since that
+		// will remove the routes as well.
+		routes, def, err := routesForIface(iface)
+		if err != nil {
+			return fmt.Errorf("error getting routes for interface %q: %v", iface.Name, err)
+		}
+		if def != nil {
+			if !args.DefaultGateway.Route.Empty() {
+				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, def, args.DefaultGateway)
+			}
+			args.DefaultGateway.Route = *def
+			args.DefaultGateway.Name = iface.Name
+		}
+
+		link := boot.FDBasedLink{
+			Name:   iface.Name,
+			MTU:    iface.MTU,
+			Routes: routes,
+		}
+
+		// Collect the addresses for the interface and remove them from
+		// the host.
+		for _, ifaddr := range ifaddrs {
+			ipNet, ok := ifaddr.(*net.IPNet)
+			if !ok {
+				// %T prints the dynamic type; %t is the boolean verb.
+				return fmt.Errorf("address is not IPNet: %T %+v", ifaddr, ifaddr)
+			}
+			link.Addresses = append(link.Addresses, ipNet.IP)
+
+			// Steal IP address from NIC.
+			if err := removeAddress(ifaceLink, ipNet.String()); err != nil {
+				return fmt.Errorf("error removing address %v from device %q: %v", ipNet, iface.Name, err)
+			}
+		}
+
+		args.FilePayload.Files = append(args.FilePayload.Files, deviceFile)
+		args.FDBasedLinks = append(args.FDBasedLinks, link)
+	}
+
+	log.Debugf("Setting up network, config: %+v", args)
+	if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil {
+		return fmt.Errorf("error creating links and routes: %v", err)
+	}
+	return nil
+}
+
+// loopbackLinks builds one loopback link per address of the given loopback
+// interface. Each link is named after the interface, carries that single
+// address, and has one route covering the address's network (the address
+// masked by its own netmask).
+//
+// It returns an error if any address is not a *net.IPNet.
+func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink, error) {
+	var links []boot.LoopbackLink
+	for _, addr := range addrs {
+		ipNet, ok := addr.(*net.IPNet)
+		if !ok {
+			// %T prints the dynamic type; %t is the boolean verb.
+			return nil, fmt.Errorf("address is not IPNet: %T %+v", addr, addr)
+		}
+		links = append(links, boot.LoopbackLink{
+			Name:      iface.Name,
+			Addresses: []net.IP{ipNet.IP},
+			Routes: []boot.Route{{
+				Destination: ipNet.IP.Mask(ipNet.Mask),
+				Mask:        ipNet.Mask,
+			}},
+		})
+	}
+	return links, nil
+}
+
+// routesForIface lists the routes of the given interface and converts them to
+// boot.Routes. Routes with a destination are returned in the slice. A route
+// without a destination is treated as the default route and returned
+// separately as a catch-all route to its gateway (nil if there is none).
+//
+// It is an error for a default route to have no gateway, or for the
+// interface to have more than one default route.
+func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) {
+	link, err := netlink.LinkByIndex(iface.Index)
+	if err != nil {
+		return nil, nil, err
+	}
+	kernelRoutes, err := netlink.RouteList(link, netlink.FAMILY_ALL)
+	if err != nil {
+		return nil, nil, fmt.Errorf("error getting routes from %q: %v", iface.Name, err)
+	}
+
+	var (
+		routes       []boot.Route
+		defaultRoute *boot.Route
+	)
+	for _, r := range kernelRoutes {
+		if r.Dst != nil {
+			// Regular route: keep the network and its mask.
+			routes = append(routes, boot.Route{
+				Destination: r.Dst.IP.Mask(r.Dst.Mask),
+				Mask:        r.Dst.Mask,
+			})
+			continue
+		}
+
+		// No destination means this is a default route.
+		if r.Gw == nil {
+			return nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r)
+		}
+		if defaultRoute != nil {
+			return nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defaultRoute, r)
+		}
+
+		// Pick the zero address of the gateway's address family.
+		zero := net.IPv6zero
+		if r.Gw.To4() != nil {
+			zero = net.IPv4zero
+		}
+		// Create a catch all route to the gateway.
+		defaultRoute = &boot.Route{
+			Destination: zero,
+			Mask:        net.IPMask(zero),
+			Gateway:     r.Gw,
+		}
+	}
+	return routes, defaultRoute, nil
+}
+
+// removeAddress parses ipAndMask and deletes that address from the given
+// network device. It's equivalent to:
+//   ip addr del <ipAndMask> dev <name>
+func removeAddress(source netlink.Link, ipAndMask string) error {
+	parsed, err := netlink.ParseAddr(ipAndMask)
+	if err == nil {
+		err = netlink.AddrDel(source, parsed)
+	}
+	return err
+}
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
new file mode 100644
index 000000000..b2fa1d58e
--- /dev/null
+++ b/runsc/sandbox/sandbox.go
@@ -0,0 +1,666 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package sandbox creates and manipulates sandboxes.
+package sandbox
+
+import (
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"regexp"
+	"strconv"
+	"syscall"
+	"time"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
+	"gvisor.googlesource.com/gvisor/pkg/control/client"
+	"gvisor.googlesource.com/gvisor/pkg/control/server"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/control"
+	"gvisor.googlesource.com/gvisor/pkg/urpc"
+	"gvisor.googlesource.com/gvisor/runsc/boot"
+	"gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+// metadataFilename is the name of the metadata file relative to sandboxRoot
+// that holds sandbox metadata.
+const metadataFilename = "meta.json"
+
+// idRegex matches valid sandbox ids: a non-empty string made up only of word
+// characters (ASCII letters, digits and underscore) and the characters '+',
+// ',', '-' and '.'. Inside the character class, `+-\.` is parsed as a range
+// from '+' to '.', which is why ',' is accepted as well.
+//
+// See libcontainer/factory_linux.go
+var idRegex = regexp.MustCompile(`^[\w+-\.]+$`)
+
+// validateID returns an error unless the sandbox id matches idRegex.
+func validateID(id string) error {
+	if idRegex.MatchString(id) {
+		return nil
+	}
+	return fmt.Errorf("invalid sandbox id: %v", id)
+}
+
+// Sandbox wraps a child sandbox process, and is responsible for saving and
+// loading sandbox metadata to disk.
+//
+// Within a root directory, we maintain subdirectories for each sandbox named
+// with the sandbox id.  The sandbox metadata is stored as json within the
+// sandbox directory in a file named "meta.json".  This metadata format is
+// defined by us, and is not part of the OCI spec.
+//
+// Sandboxes must write this metadata file after any change to their internal
+// state.  The entire sandbox directory is deleted when the sandbox is
+// destroyed.
+//
+// TODO: Protect against concurrent changes to the sandbox metadata
+// file.
+type Sandbox struct {
+	// ID is the sandbox ID.
+	ID string `json:"id"`
+
+	// Spec is the OCI runtime spec that configures this sandbox.
+	Spec *specs.Spec `json:"spec"`
+
+	// BundleDir is the directory containing the sandbox bundle.
+	BundleDir string `json:"bundleDir"`
+
+	// SandboxRoot is the directory containing the sandbox metadata file.
+	SandboxRoot string `json:"sandboxRoot"`
+
+	// CreatedAt is the time the sandbox was created.
+	CreatedAt time.Time `json:"createdAt"`
+
+	// Owner is the sandbox owner.
+	Owner string `json:"owner"`
+
+	// ConsoleSocket is the path to a unix domain socket that will receive
+	// the console FD.  It is only used during create, so we don't need to
+	// store it in the metadata.
+	ConsoleSocket string `json:"-"`
+
+	// Pid is the pid of the running sandbox.  Only valid if Status is
+	// Created or Running.
+	Pid int `json:"pid"`
+
+	// GoferPid is the pid of the gofer running alongside the sandbox. May be 0
+	// if the gofer has been killed or it's not being used.
+	GoferPid int `json:"goferPid"`
+
+	// Status is the current sandbox Status.
+	Status Status `json:"status"`
+}
+
+// Create starts the sandbox subprocess, waits for it to come up, and writes
+// the metadata file and (if requested) the pid file.  Args are additional
+// arguments that will be passed to the sandbox process.
+//
+// It fails if the id is invalid or a sandbox directory for the id already
+// exists. Once process creation has begun, any failure destroys the sandbox
+// before the error is returned.
+func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string, args []string) (*Sandbox, error) {
+	log.Debugf("Create sandbox %q in root dir: %s", id, conf.RootDir)
+	if err := validateID(id); err != nil {
+		return nil, err
+	}
+
+	sandboxRoot := filepath.Join(conf.RootDir, id)
+	if exists(sandboxRoot) {
+		return nil, fmt.Errorf("sandbox with id %q already exists: %q ", id, sandboxRoot)
+	}
+
+	s := &Sandbox{
+		ID:            id,
+		Spec:          spec,
+		ConsoleSocket: consoleSocket,
+		BundleDir:     bundleDir,
+		SandboxRoot:   sandboxRoot,
+		Status:        Creating,
+		Owner:         os.Getenv("USER"),
+	}
+
+	// From here on, every failure MUST clean up all sandbox resources. abort
+	// does that and hands the error back to the caller.
+	abort := func(err error) (*Sandbox, error) {
+		s.Destroy()
+		return nil, err
+	}
+
+	// Create sandbox process.
+	if err := s.createProcesses(conf, args); err != nil {
+		return abort(err)
+	}
+
+	// The sandbox is not "created" until the control server has come up
+	// (or we time out waiting for it).
+	if err := s.waitForCreated(10 * time.Second); err != nil {
+		return abort(err)
+	}
+	s.Status = Created
+	s.CreatedAt = time.Now()
+
+	// Save the metadata file.
+	if err := s.save(); err != nil {
+		return abort(err)
+	}
+
+	// Write the pid file.  Containerd considers the create complete after
+	// this file is created, so it must be the last thing we do.
+	if pidFile != "" {
+		pid := []byte(strconv.Itoa(s.Pid))
+		if err := ioutil.WriteFile(pidFile, pid, 0644); err != nil {
+			return abort(fmt.Errorf("error writing pid file: %v", err))
+		}
+	}
+
+	return s, nil
+}
+
+// Run creates a sandbox, starts it, and waits for it, returning the result of
+// Wait. It is a helper that chains Create + Start + Wait.
+func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string, args []string) (syscall.WaitStatus, error) {
+	sb, err := Create(id, spec, conf, bundleDir, consoleSocket, pidFile, args)
+	if err != nil {
+		return 0, fmt.Errorf("error creating sandbox: %v", err)
+	}
+	err = sb.Start(conf)
+	if err != nil {
+		return 0, fmt.Errorf("error starting sandbox: %v", err)
+	}
+	return sb.Wait()
+}
+
+// Load reads the sandbox with the given id from its metadata file under
+// rootDir.
+//
+// If the stored status is Running or Created but signalling the sandbox
+// fails, the returned sandbox is marked Stopped with a zero Pid. The metadata
+// file itself is not rewritten here.
+func Load(rootDir, id string) (*Sandbox, error) {
+	log.Debugf("Load sandbox %q %q", rootDir, id)
+	if err := validateID(id); err != nil {
+		return nil, err
+	}
+
+	sandboxRoot := filepath.Join(rootDir, id)
+	if !exists(sandboxRoot) {
+		return nil, fmt.Errorf("sandbox with id %q does not exist", id)
+	}
+	metaFile := filepath.Join(sandboxRoot, metadataFilename)
+	if !exists(metaFile) {
+		return nil, fmt.Errorf("sandbox with id %q does not have metadata file %q", id, metaFile)
+	}
+
+	raw, err := ioutil.ReadFile(metaFile)
+	if err != nil {
+		return nil, fmt.Errorf("error reading sandbox metadata file %q: %v", metaFile, err)
+	}
+	var sb Sandbox
+	if err := json.Unmarshal(raw, &sb); err != nil {
+		return nil, fmt.Errorf("error unmarshaling sandbox metadata from %q: %v", metaFile, err)
+	}
+
+	// A sandbox recorded as live may have died since the metadata was
+	// written, so probe it.  This is inherently racy.
+	switch sb.Status {
+	case Running, Created:
+		// Send signal 0 to check if process exists.
+		if err := sb.Signal(0); err != nil {
+			// Process no longer exists.
+			sb.Status = Stopped
+			sb.Pid = 0
+		}
+	}
+
+	return &sb, nil
+}
+
+// List returns the sandbox ids found in the given root directory. Every
+// directory entry name is returned as an id, without further validation.
+func List(rootDir string) ([]string, error) {
+	log.Debugf("List sandboxes %q", rootDir)
+	entries, err := ioutil.ReadDir(rootDir)
+	if err != nil {
+		return nil, fmt.Errorf("ReadDir(%s) failed: %v", rootDir, err)
+	}
+
+	var ids []string
+	for i := range entries {
+		ids = append(ids, entries[i].Name())
+	}
+	return ids, nil
+}
+
+// State returns the metadata of the sandbox.
+func (s *Sandbox) State() specs.State {
+	return specs.State{
+		Version: specs.Version,
+		ID:      s.ID,
+		Status:  s.Status.String(),
+		Pid:     s.Pid,
+		Bundle:  s.BundleDir,
+	}
+}
+
+// Start starts running the containerized process inside the sandbox. It runs
+// the prestart hooks, configures the network, asks the sandbox control server
+// to start the application, runs the poststart hooks, and finally saves the
+// Running status to the metadata file.
+//
+// The sandbox must be in the Created state. If a prestart hook, the control
+// connection, the network setup, or the start call fails, the sandbox is
+// destroyed and the error is returned.
+func (s *Sandbox) Start(conf *boot.Config) error {
+	log.Debugf("Start sandbox %q, pid: %d", s.ID, s.Pid)
+	if s.Status != Created {
+		return fmt.Errorf("cannot start container in state %s", s.Status)
+	}
+
+	// destroyAndFail tears the sandbox down before handing err back.
+	destroyAndFail := func(err error) error {
+		s.Destroy()
+		return err
+	}
+
+	hooks := s.Spec.Hooks
+
+	// "If any prestart hook fails, the runtime MUST generate an error,
+	// stop and destroy the container".
+	if hooks != nil {
+		if err := executeHooks(hooks.Prestart, s.State()); err != nil {
+			return destroyAndFail(err)
+		}
+	}
+
+	c, err := s.connect()
+	if err != nil {
+		return destroyAndFail(err)
+	}
+	defer c.Close()
+
+	// Configure the network.
+	if err := setupNetwork(c, s.Pid, s.Spec, conf); err != nil {
+		return destroyAndFail(fmt.Errorf("error setting up network: %v", err))
+	}
+
+	// Send a message to the sandbox control server to start the
+	// application.
+	if err := c.Call(boot.ApplicationStart, nil, nil); err != nil {
+		return destroyAndFail(fmt.Errorf("error starting sandbox: %v", err))
+	}
+
+	// "If any poststart hook fails, the runtime MUST log a warning, but
+	// the remaining hooks and lifecycle continue as if the hook had
+	// succeeded".
+	//
+	// NOTE(review): the state handed to the poststart hooks is built before
+	// Status is switched to Running below - confirm this is intended.
+	if hooks != nil {
+		executeHooksBestEffort(hooks.Poststart, s.State())
+	}
+
+	s.Status = Running
+	return s.save()
+}
+
+// Processes asks the sandbox control server for the list of processes inside
+// the sandbox and their associated metadata. The sandbox must be Running.
+func (s *Sandbox) Processes() ([]*control.Process, error) {
+	if s.Status != Running {
+		return nil, fmt.Errorf("cannot get processes of container %q because it isn't running. It is in state %v", s.ID, s.Status)
+	}
+
+	conn, err := s.connect()
+	if err != nil {
+		return nil, err
+	}
+	defer conn.Close()
+
+	var procs []*control.Process
+	err = conn.Call(boot.ApplicationProcesses, nil, &procs)
+	if err != nil {
+		return nil, fmt.Errorf("error retrieving process data from sandbox: %v", err)
+	}
+	return procs, nil
+}
+
+// Execute runs the specified command in the sandbox and returns the wait
+// status reported by the sandbox control server. The sandbox must be Created
+// or Running.
+func (s *Sandbox) Execute(e *control.ExecArgs) (syscall.WaitStatus, error) {
+	log.Debugf("Execute in sandbox %q, pid: %d, args: %+v", s.ID, s.Pid, e)
+	switch s.Status {
+	case Created, Running:
+		// Exec is allowed in these states.
+	default:
+		return 0, fmt.Errorf("cannot exec in container in state %s", s.Status)
+	}
+
+	log.Debugf("Connecting to sandbox...")
+	conn, err := s.connect()
+	if err != nil {
+		return 0, fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err)
+	}
+	defer conn.Close()
+
+	// Ask the sandbox control server to run the command; the raw wait status
+	// comes back through the reply.
+	var rawStatus uint32
+	err = conn.Call(boot.ApplicationExecute, e, &rawStatus)
+	if err != nil {
+		return 0, fmt.Errorf("error executing in sandbox: %v", err)
+	}
+
+	return syscall.WaitStatus(rawStatus), nil
+}
+
+// Event asks the sandbox control server for an event record (stats such as
+// memory and CPU utilization) and tags it with the sandbox ID. The sandbox
+// must be in the Running or Created state.
+func (s *Sandbox) Event() (*boot.Event, error) {
+	switch s.Status {
+	case Running, Created:
+		// Allowed states.
+	default:
+		return nil, fmt.Errorf("cannot get events for container in state: %s", s.Status)
+	}
+
+	conn, err := s.connect()
+	if err != nil {
+		return nil, err
+	}
+	defer conn.Close()
+
+	ev := new(boot.Event)
+	if err := conn.Call(boot.ApplicationEvent, nil, ev); err != nil {
+		return nil, fmt.Errorf("error retrieving event data from sandbox: %v", err)
+	}
+	ev.ID = s.ID
+	return ev, nil
+}
+
+// connect opens a client connection to the sandbox control server, using the
+// control socket address derived from the sandbox ID. The caller is
+// responsible for closing the returned client.
+func (s *Sandbox) connect() (*urpc.Client, error) {
+	log.Debugf("Connecting to sandbox...")
+	addr := boot.ControlSocketAddr(s.ID)
+	conn, err := client.ConnectTo(addr)
+	if err == nil {
+		return conn, nil
+	}
+	return nil, fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err)
+}
+
+// createProcesses looks up the binary path via specutils.BinPath, starts the
+// gofer process (when one is needed) and then starts the sandbox process,
+// passing it the IO files returned by the gofer setup. args are the common
+// command-line arguments given to both processes.
+func (s *Sandbox) createProcesses(conf *boot.Config, args []string) error {
+	bin, err := specutils.BinPath()
+	if err != nil {
+		return err
+	}
+
+	goferFiles, err := s.createGoferProcess(conf, bin, args)
+	if err == nil {
+		err = s.createSandboxProcess(conf, bin, args, goferFiles)
+	}
+	return err
+}
+
+// createGoferProcess starts the gofer process when conf.FileAccess is
+// boot.FileAccessProxy. One socket pair is created for the root mount plus
+// one for every 9P mount in the spec; the gofer ends are donated to the gofer
+// process and the sandbox ends are returned to the caller, which takes
+// ownership of them.
+//
+// It returns (nil, nil) when no gofer is needed. On failure every socket
+// created so far is closed, so nothing is leaked to the caller.
+func (s *Sandbox) createGoferProcess(conf *boot.Config, binPath string, commonArgs []string) ([]*os.File, error) {
+	if conf.FileAccess != boot.FileAccessProxy {
+		// Don't start a gofer. The sandbox will access host FS directly.
+		return nil, nil
+	}
+
+	var args []string
+	args = append(args, commonArgs...)
+	args = append(args, "gofer", "--bundle", s.BundleDir)
+
+	// Start with root mount and then add any other additional mount.
+	mountCount := 1
+	for _, m := range s.Spec.Mounts {
+		if specutils.Is9PMount(m) {
+			mountCount++
+		}
+	}
+
+	sandEnds := make([]*os.File, 0, mountCount)
+	goferEnds := make([]*os.File, 0, mountCount)
+
+	// The gofer ends are only needed in this process until the gofer has been
+	// started, so they are always closed on return. The sandbox ends are
+	// handed to the caller on success and must be closed here on failure.
+	started := false
+	defer func() {
+		for _, f := range goferEnds {
+			f.Close()
+		}
+		if !started {
+			for _, f := range sandEnds {
+				f.Close()
+			}
+		}
+	}()
+
+	for i := 0; i < mountCount; i++ {
+		// Create socket that connects the sandbox and gofer.
+		fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0)
+		if err != nil {
+			return nil, err
+		}
+		sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox io fd"))
+		goferEnds = append(goferEnds, os.NewFile(uintptr(fds[1]), "gofer io fd"))
+
+		// ExtraFiles entry i becomes FD 3+i in the child process.
+		args = append(args, fmt.Sprintf("--io-fds=%d", 3+i))
+	}
+
+	cmd := exec.Command(binPath, args...)
+	cmd.ExtraFiles = goferEnds
+
+	// Setup any uid/gid mappings, and create or join the configured user
+	// namespace so the gofer's view of the filesystem aligns with the
+	// users in the sandbox.
+	setUIDGIDMappings(cmd, s.Spec)
+	nss := filterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, s.Spec)
+
+	// Start the gofer in the given namespace.
+	log.Debugf("Starting gofer: %s %v", binPath, args)
+	if err := startInNS(cmd, nss); err != nil {
+		return nil, err
+	}
+	started = true
+	s.GoferPid = cmd.Process.Pid
+	log.Infof("Gofer started, pid: %d", cmd.Process.Pid)
+	return sandEnds, nil
+}
+
+// createSandboxProcess starts the sandbox as a subprocess by running the "boot"
+// command, passing in the bundle dir.
+//
+// commonArgs are placed before the "boot" subcommand. ioFiles are the sandbox
+// ends of the gofer sockets (empty when there is no gofer); they are donated
+// to the child and closed in this process when the function returns. On
+// success the child's pid is recorded in s.Pid.
+func (s *Sandbox) createSandboxProcess(conf *boot.Config, binPath string, commonArgs []string, ioFiles []*os.File) error {
+	// nextFD is used to get unused FDs that we can pass to the sandbox. It
+	// starts at 3 because 0, 1, and 2 are taken by stdin/out/err. It must be
+	// advanced in the same order that files are appended to cmd.ExtraFiles.
+	nextFD := 3
+
+	// Create control server socket here and donate FD to child process because
+	// it may be in a different network namespace and won't be reachable from
+	// outside.
+	fd, err := server.CreateSocket(boot.ControlSocketAddr(s.ID))
+	if err != nil {
+		return fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err)
+	}
+
+	consoleEnabled := s.ConsoleSocket != ""
+
+	cmd := exec.Command(binPath, commonArgs...)
+	cmd.SysProcAttr = &syscall.SysProcAttr{}
+	cmd.Args = append(cmd.Args,
+		"boot",
+		"--bundle", s.BundleDir,
+		"--controller-fd="+strconv.Itoa(nextFD),
+		fmt.Sprintf("--console=%t", consoleEnabled))
+	nextFD++
+
+	// The parent's copy of the control socket is closed on return.
+	controllerFile := os.NewFile(uintptr(fd), "control_server_socket")
+	defer controllerFile.Close()
+	cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile)
+
+	// If there is a gofer, send all socket ends to the sandbox.
+	for _, f := range ioFiles {
+		defer f.Close()
+		cmd.ExtraFiles = append(cmd.ExtraFiles, f)
+		cmd.Args = append(cmd.Args, "--io-fds="+strconv.Itoa(nextFD))
+		nextFD++
+	}
+
+	// If the console control socket file is provided, then create a new
+	// pty master/slave pair and set the tty on the sandbox process.
+	if consoleEnabled {
+		// setupConsole will send the master on the socket, and return
+		// the slave.
+		tty, err := setupConsole(s.ConsoleSocket)
+		if err != nil {
+			return fmt.Errorf("error setting up control socket %q: %v", s.ConsoleSocket, err)
+		}
+		defer tty.Close()
+
+		cmd.Stdin = tty
+		cmd.Stdout = tty
+		cmd.Stderr = tty
+		cmd.SysProcAttr.Setctty = true
+		cmd.SysProcAttr.Ctty = int(tty.Fd())
+	} else {
+		cmd.Stdin = os.Stdin
+		cmd.Stdout = os.Stdout
+		cmd.Stderr = os.Stderr
+	}
+
+	// Detach from this session, otherwise cmd will get SIGHUP and SIGCONT
+	// when re-parented.
+	cmd.SysProcAttr.Setsid = true
+
+	// nss is the set of namespaces to join or create before starting the sandbox
+	// process. IPC and UTS namespaces from the host are not used as they
+	// are virtualized inside the sandbox. Be paranoid and run inside an empty
+	// namespace for these.
+	log.Infof("Sandbox will be started in empty IPC and UTS namespaces")
+	nss := []specs.LinuxNamespace{
+		specs.LinuxNamespace{Type: specs.IPCNamespace},
+		specs.LinuxNamespace{Type: specs.UTSNamespace},
+	}
+
+	if conf.Platform == boot.PlatformPtrace {
+		// TODO: Also set an empty PID namespace so that we limit
+		// access to other host processes.
+		log.Infof("Sandbox will be started in the current PID namespace")
+	} else {
+		log.Infof("Sandbox will be started in empty PID namespace")
+		nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace})
+	}
+
+	if conf.FileAccess == boot.FileAccessProxy {
+		log.Infof("Sandbox will be started in empty mount namespace")
+		nss = append(nss, specs.LinuxNamespace{Type: specs.MountNamespace})
+	} else {
+		log.Infof("Sandbox will be started in the current mount namespace")
+	}
+
+	// Join the container's network namespace if networking is enabled and the
+	// spec provides one. The sandbox talks directly to the host network, which
+	// may have been configured in the namespace. Otherwise use an empty
+	// network namespace.
+	if ns, ok := getNS(specs.NetworkNamespace, s.Spec); ok && conf.Network != boot.NetworkNone {
+		log.Infof("Sandbox will be started in the container's network namespace: %+v", ns)
+		nss = append(nss, ns)
+	} else {
+		log.Infof("Sandbox will be started in empty network namespace")
+		nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace})
+	}
+
+	// User namespace depends on the following options:
+	//   - Host network/filesystem: requires to run inside the user namespace
+	//       specified in the spec or the current namespace if none is configured.
+	//   - Gofer: when using a Gofer, the sandbox process can run isolated in an
+	//       empty namespace.
+	if conf.Network == boot.NetworkHost || conf.FileAccess == boot.FileAccessDirect {
+		if userns, ok := getNS(specs.UserNamespace, s.Spec); ok {
+			log.Infof("Sandbox will be started in container's user namespace: %+v", userns)
+			nss = append(nss, userns)
+			setUIDGIDMappings(cmd, s.Spec)
+		} else {
+			// TODO: Restrict capabilities since it's using current user
+			// namespace, i.e. root.
+			log.Infof("Sandbox will be started in the current user namespace")
+		}
+		// When running in the caller's defined user namespace, apply the same
+		// capabilities to the sandbox process to ensure it abides to the same
+		// rules.
+		cmd.Args = append(cmd.Args, "--apply-caps=true")
+
+	} else {
+		log.Infof("Sandbox will be started in empty user namespace")
+		nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace})
+	}
+
+	log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args)
+	if err := startInNS(cmd, nss); err != nil {
+		return err
+	}
+	s.Pid = cmd.Process.Pid
+	log.Infof("Sandbox started, pid: %d", s.Pid)
+	return nil
+}
+
+// waitForCreated waits for the sandbox subprocess control server to be
+// running, at which point the sandbox is in Created state.
+//
+// It polls the control socket until a connection succeeds or timeout elapses,
+// in which case an error is returned. A short pause separates attempts so the
+// caller does not spin a CPU while the sandbox process is still starting.
+func (s *Sandbox) waitForCreated(timeout time.Duration) error {
+	const pollInterval = 10 * time.Millisecond
+
+	log.Debugf("Waiting for sandbox %q creation", s.ID)
+	tchan := time.After(timeout)
+	for {
+		select {
+		case <-tchan:
+			return fmt.Errorf("timed out waiting for sandbox control server")
+		default:
+		}
+
+		if c, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)); err == nil {
+			// It's alive!
+			c.Close()
+			return nil
+		}
+		time.Sleep(pollInterval)
+	}
+}
+
+// Wait blocks until the process with pid s.Pid exits and returns its
+// WaitStatus.
+func (s *Sandbox) Wait() (syscall.WaitStatus, error) {
+	log.Debugf("Wait on sandbox %q with pid %d", s.ID, s.Pid)
+	proc, err := os.FindProcess(s.Pid)
+	if err != nil {
+		// "On Unix systems, FindProcess always succeeds and returns a
+		// Process for the given pid."
+		panic(err)
+	}
+
+	state, err := proc.Wait()
+	if err != nil {
+		return 0, err
+	}
+	status := state.Sys().(syscall.WaitStatus)
+	return status, nil
+}
+
+// Destroy releases everything associated with the sandbox: it kills the
+// sandbox and gofer processes (if their pids are recorded), removes the
+// sandbox root directory, runs the poststop hooks when the sandbox was Created
+// or Running, and marks the sandbox as Stopped. It always returns nil.
+func (s *Sandbox) Destroy() error {
+	log.Debugf("Destroy sandbox %q", s.ID)
+
+	if pid := s.Pid; pid != 0 {
+		// TODO: Too harsh?
+		log.Debugf("Killing sandbox %q", s.ID)
+		sendSignal(pid, unix.SIGKILL)
+		s.Pid = 0
+	}
+	if pid := s.GoferPid; pid != 0 {
+		log.Debugf("Killing gofer for sandbox %q", s.ID)
+		sendSignal(pid, unix.SIGKILL)
+		s.GoferPid = 0
+	}
+
+	err := os.RemoveAll(s.SandboxRoot)
+	if err != nil {
+		log.Warningf("Failed to delete sandbox root directory %q, err: %v", s.SandboxRoot, err)
+	}
+
+	// "If any poststop hook fails, the runtime MUST log a warning, but the
+	// remaining hooks and lifecycle continue as if the hook had succeeded".
+	wasLive := s.Status == Created || s.Status == Running
+	if hooks := s.Spec.Hooks; hooks != nil && wasLive {
+		executeHooksBestEffort(hooks.Poststop, s.State())
+	}
+
+	s.Status = Stopped
+	return nil
+}
+
+// Signal delivers sig to the sandbox process. When the sandbox is already
+// Stopped nothing is sent; a warning is logged and nil is returned.
+func (s *Sandbox) Signal(sig syscall.Signal) error {
+	log.Debugf("Signal sandbox %q", s.ID)
+	if s.Status != Stopped {
+		return sendSignal(s.Pid, sig)
+	}
+	log.Warningf("sandbox %q not running, not sending signal %v to pid %d", s.ID, sig, s.Pid)
+	return nil
+}
+
+// sendSignal sends sig to the process identified by pid, wrapping any failure
+// with the signal number and pid.
+func sendSignal(pid int, sig syscall.Signal) error {
+	err := syscall.Kill(pid, sig)
+	if err == nil {
+		return nil
+	}
+	return fmt.Errorf("error sending signal %d to pid %d: %v", sig, pid, err)
+}
+
+// save persists the sandbox metadata: it makes sure the sandbox root
+// directory exists, then writes the JSON encoding of s to the metadata file
+// inside it.
+func (s *Sandbox) save() error {
+	log.Debugf("Save sandbox %q", s.ID)
+
+	root := s.SandboxRoot
+	if err := os.MkdirAll(root, 0711); err != nil {
+		return fmt.Errorf("error creating sandbox root directory %q: %v", root, err)
+	}
+
+	data, err := json.Marshal(s)
+	if err != nil {
+		return fmt.Errorf("error marshaling sandbox metadata: %v", err)
+	}
+
+	err = ioutil.WriteFile(filepath.Join(root, metadataFilename), data, 0640)
+	if err != nil {
+		return fmt.Errorf("error writing sandbox metadata: %v", err)
+	}
+	return nil
+}
+
+// exists reports whether f can be stat'ed. A stat failure other than "not
+// exist" is logged as a warning and treated as the file not existing.
+func exists(f string) bool {
+	_, err := os.Stat(f)
+	switch {
+	case err == nil:
+		return true
+	case !os.IsNotExist(err):
+		log.Warningf("error checking for file %q: %v", f, err)
+	}
+	return false
+}
diff --git a/runsc/sandbox/sandbox_test.go b/runsc/sandbox/sandbox_test.go
new file mode 100644
index 000000000..6c71cac30
--- /dev/null
+++ b/runsc/sandbox/sandbox_test.go
@@ -0,0 +1,649 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sandbox_test
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"os"
+	"os/signal"
+	"path/filepath"
+	"reflect"
+	"strings"
+	"syscall"
+	"testing"
+	"time"
+
+	"context"
+	"flag"
+	"github.com/google/subcommands"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/control"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.googlesource.com/gvisor/pkg/unet"
+	"gvisor.googlesource.com/gvisor/runsc/boot"
+	"gvisor.googlesource.com/gvisor/runsc/cmd"
+	"gvisor.googlesource.com/gvisor/runsc/sandbox"
+)
+
+// init sets the log level to Debug when the test package is initialized.
+func init() {
+	log.SetLevel(log.Debug)
+}
+
+// writeSpec serializes spec as JSON into "config.json" inside dir.
+func writeSpec(dir string, spec *specs.Spec) error {
+	data, err := json.Marshal(spec)
+	if err != nil {
+		return err
+	}
+	configPath := filepath.Join(dir, "config.json")
+	return ioutil.WriteFile(configPath, data, 0755)
+}
+
+// newSpecWithArgs builds a minimal spec for tests: the process runs args with
+// the caller's PATH, and the root filesystem is the host's "/" mounted
+// read-only.
+func newSpecWithArgs(args ...string) *specs.Spec {
+	process := &specs.Process{
+		Args: args,
+		Env:  []string{"PATH=" + os.Getenv("PATH")},
+	}
+	// The host filesystem root is the sandbox root.
+	root := &specs.Root{
+		Path:     "/",
+		Readonly: true,
+	}
+	return &specs.Spec{
+		Root:    root,
+		Process: process,
+	}
+}
+
+// shutdownSignal will be sent to the sandbox in order to shut down cleanly.
+// TestMain, when acting as the "boot" or "gofer" command, waits for this
+// signal and then exits with status 0.
+const shutdownSignal = syscall.SIGUSR2
+
+// setupSandbox creates a bundle and root dir for the sandbox, generates a test
+// config, and writes the spec to config.json in the bundle dir.
+//
+// On success the caller owns both directories and must remove them. On
+// failure any directory that was already created is removed before
+// returning, so nothing is left behind.
+func setupSandbox(spec *specs.Spec) (rootDir, bundleDir string, conf *boot.Config, err error) {
+	rootDir, err = ioutil.TempDir("", "sandboxes")
+	if err != nil {
+		return "", "", nil, fmt.Errorf("error creating root dir: %v", err)
+	}
+
+	bundleDir, err = ioutil.TempDir("", "bundle")
+	if err != nil {
+		os.RemoveAll(rootDir)
+		return "", "", nil, fmt.Errorf("error creating bundle dir: %v", err)
+	}
+
+	if err = writeSpec(bundleDir, spec); err != nil {
+		os.RemoveAll(rootDir)
+		os.RemoveAll(bundleDir)
+		return "", "", nil, fmt.Errorf("error writing spec: %v", err)
+	}
+
+	conf = &boot.Config{
+		RootDir: rootDir,
+		Network: boot.NetworkNone,
+	}
+
+	return rootDir, bundleDir, conf, nil
+}
+
+// uniqueSandboxID returns a sandbox id derived from the current time in
+// nanoseconds, so that each test gets its own id.
+//
+// The sandbox id is used to create an abstract unix domain socket, which must
+// be unique. While the sandbox forbids creating two sandboxes with the same
+// name, sometimes between test runs the socket does not get cleaned up quickly
+// enough, causing sandbox creation to fail.
+func uniqueSandboxID() string {
+	nanos := time.Now().UnixNano()
+	return fmt.Sprintf("test-sandbox-%d", nanos)
+}
+
+// waitForProcessList polls the sandbox for up to 10 seconds until its process
+// list matches expected (as judged by procListsEqual). It returns an error if
+// the process list cannot be retrieved, or if no match is seen in time; the
+// timeout error includes the last list that was observed.
+func waitForProcessList(s *sandbox.Sandbox, expected []*control.Process) error {
+	var got []*control.Process
+	for start := time.Now(); time.Since(start) < 10*time.Second; {
+		var err error
+		// Assign with "=" so the outer got is updated; ":=" would declare a
+		// loop-local got and leave the final error message empty.
+		got, err = s.Processes()
+		if err != nil {
+			return fmt.Errorf("error getting process data from sandbox: %v", err)
+		}
+		if procListsEqual(got, expected) {
+			return nil
+		}
+		// Process might not have started, try again...
+		time.Sleep(10 * time.Millisecond)
+	}
+	return fmt.Errorf("sandbox got process list: %s, want: %s", procListToString(got), procListToString(expected))
+}
+
+// TestLifecycle tests the basic Create/Start/Signal/Destroy sandbox lifecycle.
+// It verifies after each step that the sandbox can be loaded from disk, and
+// has the correct status.
+func TestLifecycle(t *testing.T) {
+	// The sandbox will just sleep for a long time.  We will kill it before
+	// it finishes sleeping.
+	spec := newSpecWithArgs("sleep", "100")
+
+	rootDir, bundleDir, conf, err := setupSandbox(spec)
+	if err != nil {
+		t.Fatalf("error setting up sandbox: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+	defer os.RemoveAll(bundleDir)
+
+	// expectedPL lists the expected process state of the sandbox.
+	expectedPL := []*control.Process{
+		{
+			UID:  0,
+			PID:  1,
+			PPID: 0,
+			C:    0,
+			Cmd:  "sleep",
+		},
+	}
+	// Create the sandbox.
+	id := uniqueSandboxID()
+	if _, err := sandbox.Create(id, spec, conf, bundleDir, "", "", nil); err != nil {
+		t.Fatalf("error creating sandbox: %v", err)
+	}
+	// Load the sandbox from disk and check the status.
+	s, err := sandbox.Load(rootDir, id)
+	if err != nil {
+		t.Fatalf("error loading sandbox: %v", err)
+	}
+	if got, want := s.Status, sandbox.Created; got != want {
+		t.Errorf("sandbox status got %v, want %v", got, want)
+	}
+
+	// List should return the sandbox id.
+	ids, err := sandbox.List(rootDir)
+	if err != nil {
+		t.Fatalf("error listing sandboxes: %v", err)
+	}
+	if got, want := ids, []string{id}; !reflect.DeepEqual(got, want) {
+		t.Errorf("sandbox list got %v, want %v", got, want)
+	}
+
+	// Start the sandbox.
+	if err := s.Start(conf); err != nil {
+		t.Fatalf("error starting sandbox: %v", err)
+	}
+	// Load the sandbox from disk and check the status.
+	s, err = sandbox.Load(rootDir, id)
+	if err != nil {
+		t.Fatalf("error loading sandbox: %v", err)
+	}
+	if got, want := s.Status, sandbox.Running; got != want {
+		t.Errorf("sandbox status got %v, want %v", got, want)
+	}
+
+	// Verify that "sleep 100" is running.
+	if err := waitForProcessList(s, expectedPL); err != nil {
+		t.Error(err)
+	}
+
+	// Send the sandbox a signal, which we catch and use to cleanly
+	// shutdown.
+	if err := s.Signal(shutdownSignal); err != nil {
+		t.Fatalf("error sending signal %v to sandbox: %v", shutdownSignal, err)
+	}
+	// Wait for it to die.
+	if _, err := s.Wait(); err != nil {
+		t.Fatalf("error waiting on sandbox: %v", err)
+	}
+	// Load the sandbox from disk and check the status.
+	s, err = sandbox.Load(rootDir, id)
+	if err != nil {
+		t.Fatalf("error loading sandbox: %v", err)
+	}
+	if got, want := s.Status, sandbox.Stopped; got != want {
+		t.Errorf("sandbox status got %v, want %v", got, want)
+	}
+
+	// Destroy the sandbox.
+	if err := s.Destroy(); err != nil {
+		t.Fatalf("error destroying sandbox: %v", err)
+	}
+
+	// List should not return the sandbox id.
+	ids, err = sandbox.List(rootDir)
+	if err != nil {
+		t.Fatalf("error listing sandboxes: %v", err)
+	}
+	if len(ids) != 0 {
+		t.Errorf("expected sandbox list to be empty, but got %v", ids)
+	}
+
+	// Loading the sandbox by id should fail.
+	if _, err = sandbox.Load(rootDir, id); err == nil {
+		t.Errorf("expected loading destroyed sandbox to fail, but it did not")
+	}
+}
+
+// TestExePath checks that the application can be executed when its path is
+// given as a bare name, a relative path or an absolute path, and that running
+// fails for each of those forms when the file does not exist.
+func TestExePath(t *testing.T) {
+	cases := []struct {
+		path    string
+		success bool
+	}{
+		{path: "true", success: true},
+		{path: "bin/true", success: true},
+		{path: "/bin/true", success: true},
+		{path: "thisfiledoesntexit", success: false},
+		{path: "bin/thisfiledoesntexit", success: false},
+		{path: "/bin/thisfiledoesntexit", success: false},
+	}
+	for _, tc := range cases {
+		spec := newSpecWithArgs(tc.path)
+		rootDir, bundleDir, conf, err := setupSandbox(spec)
+		if err != nil {
+			t.Fatalf("exec: %s, error setting up sandbox: %v", tc.path, err)
+		}
+
+		ws, runErr := sandbox.Run(uniqueSandboxID(), spec, conf, bundleDir, "", "", nil)
+
+		// Clean up right away rather than deferring inside the loop.
+		os.RemoveAll(rootDir)
+		os.RemoveAll(bundleDir)
+
+		if !tc.success {
+			if runErr == nil {
+				t.Errorf("exec: %s, got: no error, want: error", tc.path)
+			}
+			continue
+		}
+		if runErr != nil {
+			t.Errorf("exec: %s, error running sandbox: %v", tc.path, runErr)
+		}
+		if ws.ExitStatus() != 0 {
+			t.Errorf("exec: %s, got exit status %v want %v", tc.path, ws.ExitStatus(), 0)
+		}
+	}
+}
+
+// TestAppExitStatus checks that the application's exit status can be
+// retrieved from the sandbox, for both a zero and a non-zero status.
+func TestAppExitStatus(t *testing.T) {
+	// First sandbox will succeed.
+	succSpec := newSpecWithArgs("true")
+
+	rootDir, bundleDir, conf, err := setupSandbox(succSpec)
+	if err != nil {
+		t.Fatalf("error setting up sandbox: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+	defer os.RemoveAll(bundleDir)
+
+	ws, err := sandbox.Run(uniqueSandboxID(), succSpec, conf, bundleDir, "", "", nil)
+	if err != nil {
+		t.Fatalf("error running sandbox: %v", err)
+	}
+	if ws.ExitStatus() != 0 {
+		t.Errorf("got exit status %v want %v", ws.ExitStatus(), 0)
+	}
+
+	// Second sandbox exits with non-zero status.
+	wantStatus := 123
+	errSpec := newSpecWithArgs("bash", "-c", fmt.Sprintf("exit %d", wantStatus))
+
+	rootDir2, bundleDir2, conf, err := setupSandbox(errSpec)
+	if err != nil {
+		t.Fatalf("error setting up sandbox: %v", err)
+	}
+	defer os.RemoveAll(rootDir2)
+	defer os.RemoveAll(bundleDir2)
+
+	// Run the failing spec here; passing succSpec again would not exercise
+	// the non-zero exit path this half of the test is about.
+	ws, err = sandbox.Run(uniqueSandboxID(), errSpec, conf, bundleDir2, "", "", nil)
+	if err != nil {
+		t.Fatalf("error running sandbox: %v", err)
+	}
+	if ws.ExitStatus() != wantStatus {
+		t.Errorf("got exit status %v want %v", ws.ExitStatus(), wantStatus)
+	}
+}
+
+// TestExec verifies that a sandbox can exec a new program: with "sleep 100"
+// running as the sandbox process, it execs "sleep 5" as a different uid,
+// checks that both show up in the process list, and checks that the exec
+// finishes without error.
+func TestExec(t *testing.T) {
+	const uid = 343
+	spec := newSpecWithArgs("sleep", "100")
+
+	rootDir, bundleDir, conf, err := setupSandbox(spec)
+	if err != nil {
+		t.Fatalf("error setting up sandbox: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+	defer os.RemoveAll(bundleDir)
+
+	// Create and start the sandbox.
+	s, err := sandbox.Create(uniqueSandboxID(), spec, conf, bundleDir, "", "", nil)
+	if err != nil {
+		t.Fatalf("error creating sandbox: %v", err)
+	}
+	defer s.Destroy()
+	if err := s.Start(conf); err != nil {
+		t.Fatalf("error starting sandbox: %v", err)
+	}
+
+	// expectedPL lists the expected process state of the sandbox.
+	expectedPL := []*control.Process{
+		{
+			UID:  0,
+			PID:  1,
+			PPID: 0,
+			C:    0,
+			Cmd:  "sleep",
+		},
+		{
+			UID:  uid,
+			PID:  2,
+			PPID: 0,
+			C:    0,
+			Cmd:  "sleep",
+		},
+	}
+
+	// Verify that "sleep 100" is running.
+	if err := waitForProcessList(s, expectedPL[:1]); err != nil {
+		t.Error(err)
+	}
+
+	execArgs := control.ExecArgs{
+		Filename:         "/bin/sleep",
+		Argv:             []string{"sleep", "5"},
+		Envv:             []string{"PATH=" + os.Getenv("PATH")},
+		WorkingDirectory: "/",
+		KUID:             uid,
+		Detach:           false,
+	}
+
+	// Verify that "sleep 100" and "sleep 5" are running after exec.
+	// First, start running exec (which blocks). The channel is buffered so
+	// the goroutine can always deliver its result and exit.
+	status := make(chan error, 1)
+	go func() {
+		exitStatus, err := s.Execute(&execArgs)
+		if err != nil {
+			status <- err
+		} else if exitStatus != 0 {
+			status <- fmt.Errorf("failed with exit status: %v", exitStatus)
+		} else {
+			status <- nil
+		}
+	}()
+
+	if err := waitForProcessList(s, expectedPL); err != nil {
+		t.Fatal(err)
+	}
+
+	// Ensure that exec finished without error.
+	select {
+	case <-time.After(10 * time.Second):
+		t.Fatalf("sandbox timed out waiting for exec to finish.")
+	case st := <-status:
+		if st != nil {
+			// Report the error received from the exec goroutine, not the
+			// unrelated outer err.
+			t.Errorf("sandbox failed to exec %v: %v", execArgs, st)
+		}
+	}
+}
+
+// TestCapabilities verifies that:
+// - Running exec as non-root UID and GID will result in an error (because the
+//   executable file can't be read).
+// - Running exec as non-root with CAP_DAC_OVERRIDE succeeds because it skips
+//   this check.
+func TestCapabilities(t *testing.T) {
+	const uid = 343
+	const gid = 2401
+	spec := newSpecWithArgs("sleep", "100")
+
+	// We generate files in the host temporary directory.
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Destination: os.TempDir(),
+		Source:      os.TempDir(),
+		Type:        "bind",
+	})
+
+	rootDir, bundleDir, conf, err := setupSandbox(spec)
+	if err != nil {
+		t.Fatalf("error setting up sandbox: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+	defer os.RemoveAll(bundleDir)
+
+	// Create and start the sandbox.
+	s, err := sandbox.Create(uniqueSandboxID(), spec, conf, bundleDir, "", "", nil)
+	if err != nil {
+		t.Fatalf("error creating sandbox: %v", err)
+	}
+	defer s.Destroy()
+	if err := s.Start(conf); err != nil {
+		t.Fatalf("error starting sandbox: %v", err)
+	}
+
+	// expectedPL lists the expected process state of the sandbox.
+	expectedPL := []*control.Process{
+		{
+			UID:  0,
+			PID:  1,
+			PPID: 0,
+			C:    0,
+			Cmd:  "sleep",
+		},
+		{
+			UID:  uid,
+			PID:  2,
+			PPID: 0,
+			C:    0,
+			Cmd:  "exe",
+		},
+	}
+	if err := waitForProcessList(s, expectedPL[:1]); err != nil {
+		t.Fatalf("Failed to wait for sleep to start, err: %v", err)
+	}
+
+	// Create an executable that can't be run with the specified UID:GID.
+	// This shouldn't be callable within the sandbox until we add the
+	// CAP_DAC_OVERRIDE capability to skip the access check.
+	exePath := filepath.Join(rootDir, "exe")
+	if err := ioutil.WriteFile(exePath, []byte("#!/bin/sh\necho hello"), 0770); err != nil {
+		t.Fatalf("couldn't create executable: %v", err)
+	}
+	defer os.Remove(exePath)
+
+	// Need to traverse the intermediate directory. If this fails, the first
+	// exec below would fail for the wrong reason, so stop here instead.
+	if err := os.Chmod(rootDir, 0755); err != nil {
+		t.Fatalf("couldn't chmod %q: %v", rootDir, err)
+	}
+
+	execArgs := control.ExecArgs{
+		Filename:         exePath,
+		Argv:             []string{exePath},
+		Envv:             []string{"PATH=" + os.Getenv("PATH")},
+		WorkingDirectory: "/",
+		KUID:             uid,
+		KGID:             gid,
+		Capabilities:     &auth.TaskCapabilities{},
+		Detach:           true,
+	}
+
+	// "exe" should fail because we don't have the necessary permissions.
+	if _, err := s.Execute(&execArgs); err == nil {
+		t.Fatalf("sandbox executed without error, but an error was expected")
+	}
+
+	// Now we run with the capability enabled and should succeed.
+	execArgs.Capabilities = &auth.TaskCapabilities{
+		EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE),
+	}
+	// First, start running exec.
+	if _, err := s.Execute(&execArgs); err != nil {
+		t.Fatalf("sandbox failed to exec %v: %v", execArgs, err)
+	}
+
+	if err := waitForProcessList(s, expectedPL); err != nil {
+		t.Error(err)
+	}
+}
+
+// TestConsoleSocket tests that a tty FD is sent over the console socket if
+// one is provided when the sandbox is created.
+func TestConsoleSocket(t *testing.T) {
+	spec := newSpecWithArgs("true")
+	rootDir, bundleDir, conf, err := setupSandbox(spec)
+	if err != nil {
+		t.Fatalf("error setting up sandbox: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+	defer os.RemoveAll(bundleDir)
+
+	// Create a named socket and start listening.  We use a relative path
+	// to avoid overflowing the unix path length limit (108 chars).
+	socketPath := filepath.Join(bundleDir, "socket")
+	cwd, err := os.Getwd()
+	if err != nil {
+		t.Fatalf("error getting cwd: %v", err)
+	}
+	socketRelPath, err := filepath.Rel(cwd, socketPath)
+	if err != nil {
+		t.Fatalf("error getting relative path for %q from cwd %q: %v", socketPath, cwd, err)
+	}
+	// Fall back to the absolute path when the relative one is not shorter.
+	if len(socketRelPath) > len(socketPath) {
+		socketRelPath = socketPath
+	}
+	srv, err := unet.BindAndListen(socketRelPath, false)
+	if err != nil {
+		t.Fatalf("error binding and listening to socket %q: %v", socketPath, err)
+	}
+	defer os.Remove(socketPath)
+
+	// Create the sandbox and pass the socket name.
+	id := uniqueSandboxID()
+	s, err := sandbox.Create(id, spec, conf, bundleDir, socketRelPath, "", nil)
+	if err != nil {
+		t.Fatalf("error creating sandbox: %v", err)
+	}
+
+	// Open the other end of the socket.
+	sock, err := srv.Accept()
+	if err != nil {
+		t.Fatalf("error accepting socket connection: %v", err)
+	}
+
+	// Allow 1 fd to be received, which is all we expect.
+	r := sock.Reader(true /* blocking */)
+	r.EnableFDs(1)
+
+	// The socket is closed right after sending the FD, so EOF is
+	// an allowed error.
+	b := [][]byte{{}}
+	if _, err := r.ReadVec(b); err != nil && err != io.EOF {
+		t.Fatalf("error reading from socket connection: %v", err)
+	}
+
+	// We should have gotten a control message.
+	fds, err := r.ExtractFDs()
+	if err != nil {
+		t.Fatalf("error extracting fds from socket connection: %v", err)
+	}
+	if len(fds) != 1 {
+		t.Fatalf("got %d fds from socket, wanted 1", len(fds))
+	}
+
+	// Verify that the fd is a terminal.
+	if _, err := unix.IoctlGetTermios(fds[0], unix.TCGETS); err != nil {
+		t.Errorf("fd is not a terminal (ioctl TCGETS got %v)", err)
+	}
+
+	// Shut it down.
+	if err := s.Destroy(); err != nil {
+		t.Fatalf("error destroying sandbox: %v", err)
+	}
+
+	// Close socket.
+	if err := srv.Close(); err != nil {
+		t.Fatalf("error closing socket: %v", err)
+	}
+}
+
+// procListsEqual is used to check whether 2 Process lists are equal for all
+// implemented fields. Unimplemented and timing dependent fields (Time, STime
+// and C) are ignored. The comparison works on copies of the entries, so
+// neither input list is modified.
+func procListsEqual(got, want []*control.Process) bool {
+	if len(got) != len(want) {
+		return false
+	}
+	for i := range got {
+		pd1 := *got[i]
+		pd2 := *want[i]
+		// Zero out unimplemented and timing dependent fields.
+		pd1.Time, pd2.Time = "", ""
+		pd1.STime, pd2.STime = "", ""
+		pd1.C, pd2.C = 0, 0
+		if pd1 != pd2 {
+			return false
+		}
+	}
+	return true
+}
+
+// procListToString renders a process list as "[p1,p2,...]", formatting each
+// entry with %+v.
+func procListToString(pl []*control.Process) string {
+	parts := make([]string, len(pl))
+	for i := range pl {
+		parts[i] = fmt.Sprintf("%+v", pl[i])
+	}
+	return "[" + strings.Join(parts, ",") + "]"
+}
+
+// TestMain acts like runsc if it is called with the "boot" or "gofer"
+// argument, otherwise it just runs the tests.  This is required because
+// creating a sandbox will call "/proc/self/exe boot".  Normally /proc/self/exe
+// is the runsc binary, but for tests we have to fake it.
+func TestMain(m *testing.M) {
+	// exit terminates the process with the given status.
+	// NOTE(review): the original comment said this writes coverage data
+	// before exiting, but the body only calls os.Exit; confirm whether a
+	// coverage flush is meant to be hooked in here.
+	exit := func(status int) {
+		os.Exit(status)
+	}
+
+	if !flag.Parsed() {
+		flag.Parse()
+	}
+
+	// If we are passed one of the commands then run it.
+	subcommands.Register(new(cmd.Boot), "boot")
+	subcommands.Register(new(cmd.Gofer), "gofer")
+	switch flag.Arg(0) {
+	case "boot", "gofer":
+		// Run the command in a goroutine so we can block the main
+		// thread waiting for shutdownSignal.
+		go func() {
+			conf := &boot.Config{
+				RootDir: "unused-root-dir",
+				Network: boot.NetworkNone,
+			}
+			var ws syscall.WaitStatus
+			subcmdCode := subcommands.Execute(context.Background(), conf, &ws)
+			if subcmdCode != subcommands.ExitSuccess {
+				panic(fmt.Sprintf("command failed to execute, err: %v", subcmdCode))
+			}
+			// Sandbox exited normally. Shut down this process, using the
+			// exit status recorded in ws.
+			os.Exit(ws.ExitStatus())
+		}()
+
+		// Shutdown cleanly when the shutdownSignal is received, going
+		// through exit rather than being killed by the signal.
+		sigc := make(chan os.Signal, 1)
+		signal.Notify(sigc, shutdownSignal)
+		<-sigc
+		exit(0)
+	default:
+		// Otherwise run the tests.
+		exit(m.Run())
+	}
+}
diff --git a/runsc/sandbox/status.go b/runsc/sandbox/status.go
new file mode 100644
index 000000000..6fc936aba
--- /dev/null
+++ b/runsc/sandbox/status.go
@@ -0,0 +1,56 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sandbox
+
+// Status enumerates sandbox statuses.  The statuses and their semantics are
+// part of the runtime CLI spec.
+//
+// TODO: Get precise about the transitions between statuses.
+type Status int
+
+const (
+	// Creating indicates "the container is being created".
+	Creating Status = iota
+
+	// Created indicates "the runtime has finished the create operation and
+	// the container process has neither exited nor executed the
+	// user-specified program".
+	Created
+
+	// Running indicates "the container process has executed the
+	// user-specified program but has not exited".
+	Running
+
+	// Stopped indicates "the container process has exited".
+	Stopped
+)
+
+// String returns the runtime CLI spec name of the status, or "unknown" for a
+// value that is not one of the declared statuses.  These strings are part of
+// the runtime CLI spec and should not be changed.
+func (s Status) String() string {
+	names := [...]string{
+		Creating: "creating",
+		Created:  "created",
+		Running:  "running",
+		Stopped:  "stopped",
+	}
+	if s < 0 || int(s) >= len(names) {
+		return "unknown"
+	}
+	return names[s]
+}
diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD
new file mode 100644
index 000000000..ae89260d2
--- /dev/null
+++ b/runsc/specutils/BUILD
@@ -0,0 +1,18 @@
+package(licenses = ["notice"])  # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+    name = "specutils",
+    srcs = ["specutils.go"],
+    importpath = "gvisor.googlesource.com/gvisor/runsc/specutils",
+    visibility = [
+        "//runsc:__subpackages__",  # usable from //runsc and the packages below it
+    ],
+    deps = [  # one entry per non-stdlib import of specutils.go
+        "//pkg/abi/linux",
+        "//pkg/log",
+        "//pkg/sentry/kernel/auth",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+    ],
+)
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
new file mode 100644
index 000000000..bed0f75eb
--- /dev/null
+++ b/runsc/specutils/specutils.go
@@ -0,0 +1,183 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package specutils contains utility functions for working with OCI runtime
+// specs.
+package specutils
+
+import (
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strings"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+)
+
+// LogSpec writes spec and its Hooks, Linux, Process and Root fields to the log via log.Debugf.
+func LogSpec(spec *specs.Spec) {
+	log.Debugf("Spec: %+v", spec)
+	log.Debugf("Spec.Hooks: %+v", spec.Hooks) // reads a field of spec, so spec must be non-nil from here on
+	log.Debugf("Spec.Linux: %+v", spec.Linux)
+	log.Debugf("Spec.Process: %+v", spec.Process)
+	log.Debugf("Spec.Root: %+v", spec.Root)
+}
+
+// ReadSpec reads an OCI runtime spec from the given bundle directory.
+//
+// TODO: This should validate the spec.
+func ReadSpec(bundleDir string) (*specs.Spec, error) {
+	// The spec file must be in "config.json" inside the bundle directory.
+	specFile := filepath.Join(bundleDir, "config.json")
+	specBytes, err := ioutil.ReadFile(specFile)
+	if err != nil {
+		return nil, fmt.Errorf("error reading spec from file %q: %v", specFile, err)
+	}
+	var spec specs.Spec
+	if err := json.Unmarshal(specBytes, &spec); err != nil {
+		return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile, err, string(specBytes))
+	}
+	return &spec, nil
+}
+
+// GetExecutablePath returns the path to the executable relative to root.  A
+// name containing '/' is returned cleaned, without any lookup.  A bare name is
+// searched for in the directories of the first PATH entry in env, skipping
+// directories; if nothing matches, the name itself is returned with no error.
+func GetExecutablePath(exec, root string, env []string) (string, error) {
+	exec = filepath.Clean(exec)
+
+	// Don't search PATH if exec is a path to a file (absolute or relative).
+	if strings.Contains(exec, "/") {
+		return exec, nil
+	}
+
+	// Get the PATH from the environment; only its first occurrence is used.
+	const prefix = "PATH="
+	var path []string
+	for _, e := range env {
+		if strings.HasPrefix(e, prefix) {
+			path = strings.Split(strings.TrimPrefix(e, prefix), ":")
+			break
+		}
+	}
+
+	// Look under root for the first PATH directory that has a matching entry.
+	for _, p := range path {
+		// Directories are skipped: they cannot be executed, and an empty exec
+		// cleans to ".", which would otherwise match the PATH directory itself.
+		if fi, err := os.Stat(filepath.Join(root, p, exec)); err == nil && !fi.IsDir() {
+			return filepath.Join("/", p, exec), nil
+		}
+	}
+
+	// Could not find a suitable path, just return the cleaned name.
+	log.Warningf("could not find executable %s in path %s", exec, path)
+	return exec, nil
+}
+
+// Capabilities takes in spec and returns a TaskCapabilities corresponding to
+// the spec.
+func Capabilities(specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) {
+	var caps auth.TaskCapabilities
+	if specCaps != nil {
+		var err error
+		if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding); err != nil {
+			return nil, err
+		}
+		if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective); err != nil {
+			return nil, err
+		}
+		if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable); err != nil {
+			return nil, err
+		}
+		if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted); err != nil {
+			return nil, err
+		}
+		// TODO: Support ambient capabilities.
+	}
+	return &caps, nil
+}
+
+var capFromName = map[string]linux.Capability{ // Capability names, as looked up by capsFromNames, to their constants.
+	"CAP_CHOWN":            linux.CAP_CHOWN,
+	"CAP_DAC_OVERRIDE":     linux.CAP_DAC_OVERRIDE,
+	"CAP_DAC_READ_SEARCH":  linux.CAP_DAC_READ_SEARCH,
+	"CAP_FOWNER":           linux.CAP_FOWNER,
+	"CAP_FSETID":           linux.CAP_FSETID,
+	"CAP_KILL":             linux.CAP_KILL,
+	"CAP_SETGID":           linux.CAP_SETGID,
+	"CAP_SETUID":           linux.CAP_SETUID,
+	"CAP_SETPCAP":          linux.CAP_SETPCAP,
+	"CAP_LINUX_IMMUTABLE":  linux.CAP_LINUX_IMMUTABLE,
+	"CAP_NET_BIND_SERVICE": linux.CAP_NET_BIND_SERVICE,
+	"CAP_NET_BROADCAST":    linux.CAP_NET_BROAD_CAST, // key is the Linux name, which has no underscore inside BROADCAST
+	"CAP_NET_ADMIN":        linux.CAP_NET_ADMIN,
+	"CAP_NET_RAW":          linux.CAP_NET_RAW,
+	"CAP_IPC_LOCK":         linux.CAP_IPC_LOCK,
+	"CAP_IPC_OWNER":        linux.CAP_IPC_OWNER,
+	"CAP_SYS_MODULE":       linux.CAP_SYS_MODULE,
+	"CAP_SYS_RAWIO":        linux.CAP_SYS_RAWIO,
+	"CAP_SYS_CHROOT":       linux.CAP_SYS_CHROOT,
+	"CAP_SYS_PTRACE":       linux.CAP_SYS_PTRACE,
+	"CAP_SYS_PACCT":        linux.CAP_SYS_PACCT,
+	"CAP_SYS_ADMIN":        linux.CAP_SYS_ADMIN,
+	"CAP_SYS_BOOT":         linux.CAP_SYS_BOOT,
+	"CAP_SYS_NICE":         linux.CAP_SYS_NICE,
+	"CAP_SYS_RESOURCE":     linux.CAP_SYS_RESOURCE,
+	"CAP_SYS_TIME":         linux.CAP_SYS_TIME,
+	"CAP_SYS_TTY_CONFIG":   linux.CAP_SYS_TTY_CONFIG,
+	"CAP_MKNOD":            linux.CAP_MKNOD,
+	"CAP_LEASE":            linux.CAP_LEASE,
+	"CAP_AUDIT_WRITE":      linux.CAP_AUDIT_WRITE,
+	"CAP_AUDIT_CONTROL":    linux.CAP_AUDIT_CONTROL,
+	"CAP_SETFCAP":          linux.CAP_SETFCAP,
+	"CAP_MAC_OVERRIDE":     linux.CAP_MAC_OVERRIDE,
+	"CAP_MAC_ADMIN":        linux.CAP_MAC_ADMIN,
+	"CAP_SYSLOG":           linux.CAP_SYSLOG,
+	"CAP_WAKE_ALARM":       linux.CAP_WAKE_ALARM,
+	"CAP_BLOCK_SUSPEND":    linux.CAP_BLOCK_SUSPEND,
+}
+
+// capsFromNames converts capability names into a CapabilitySet and fails on the first unknown name.
+func capsFromNames(names []string) (auth.CapabilitySet, error) {
+	var resolved []linux.Capability
+	for _, name := range names {
+		if _, known := capFromName[name]; !known {
+			return 0, fmt.Errorf("unknown capability %q", name)
+		}
+		resolved = append(resolved, capFromName[name])
+	}
+	return auth.CapabilitySetOfMany(resolved), nil
+}
+
+// Is9PMount returns true if m can be mounted as an external gofer: a bind mount with a source, not at or under /dev.
+func Is9PMount(m specs.Mount) bool {
+	return m.Type == "bind" && m.Source != "" && !strings.HasPrefix(filepath.Clean(m.Destination)+"/", "/dev/")
+}
+
+// BinPath returns the path of the running binary with all symbolic links in
+// "/proc/self/exe" resolved, so the process name appears as 'runsc', not 'exe'.
+func BinPath() (string, error) {
+	resolved, err := filepath.EvalSymlinks("/proc/self/exe")
+	if err == nil {
+		return resolved, nil
+	}
+	return "", fmt.Errorf(`error resolving "/proc/self/exe" symlink: %v`, err)
+}
author	Googler <noreply@google.com>	2018-04-27 10:37:02 -0700
committer	Adin Scannell <ascannell@google.com>	2018-04-28 01:44:26 -0400
commit	d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree	54f95eef73aee6bacbfc736fffc631be2605ed53 /runsc
parent	f70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)