112 files changed, 25385 insertions, 0 deletions
diff --git a/runsc/BUILD b/runsc/BUILD
new file mode 100644
index 000000000..757f6d44c
--- /dev/null
+++ b/runsc/BUILD
@@ -0,0 +1,123 @@
+load("//tools:defs.bzl", "go_binary", "pkg_deb", "pkg_tar")
+
+package(licenses = ["notice"])
+
+go_binary(
+    name = "runsc",
+    srcs = [
+        "main.go",
+        "version.go",
+    ],
+    pure = True,
+    visibility = [
+        "//visibility:public",
+    ],
+    x_defs = {"main.version": "{STABLE_VERSION}"},
+    deps = [
+        "//pkg/log",
+        "//pkg/refs",
+        "//pkg/sentry/platform",
+        "//runsc/boot",
+        "//runsc/cmd",
+        "//runsc/flag",
+        "//runsc/specutils",
+        "@com_github_google_subcommands//:go_default_library",
+    ],
+)
+
+# The runsc-race target is a race-compatible BUILD target. This must be built
+# via: bazel build --features=race :runsc-race
+#
+# This is necessary because the race feature must apply to all dependencies
+# due to a bug in gazelle file selection.  The pure attribute must be off because
+# the race detector requires linking with non-Go components, although we still
+# require a static binary.
+#
+# Note that in the future this might be convertible to a compatible target by
+# using the pure and static attributes within a select function, but select is
+# not currently compatible with string attributes [1].
+#
+# [1] https://github.com/bazelbuild/bazel/issues/1698
+go_binary(
+    name = "runsc-race",
+    srcs = [
+        "main.go",
+        "version.go",
+    ],
+    static = True,
+    visibility = [
+        "//visibility:public",
+    ],
+    x_defs = {"main.version": "{STABLE_VERSION}"},
+    deps = [
+        "//pkg/log",
+        "//pkg/refs",
+        "//pkg/sentry/platform",
+        "//runsc/boot",
+        "//runsc/cmd",
+        "//runsc/flag",
+        "//runsc/specutils",
+        "@com_github_google_subcommands//:go_default_library",
+    ],
+)
+
+pkg_tar(
+    name = "runsc-bin",
+    srcs = [":runsc"],
+    mode = "0755",
+    package_dir = "/usr/bin",
+    strip_prefix = "/runsc/linux_amd64_pure_stripped",
+)
+
+pkg_tar(
+    name = "debian-data",
+    extension = "tar.gz",
+    deps = [
+        ":runsc-bin",
+    ],
+)
+
+genrule(
+    name = "deb-version",
+    # Note that runsc must appear in the srcs parameter and not the tools
+    # parameter, otherwise it will not be stamped. This is reasonable, as tools
+    # may be encoded differently in the build graph (cached more aggressively
+    # because they are assumed to be hermetic).
+    srcs = [":runsc"],
+    outs = ["version.txt"],
+    # Note that the little dance here is necessary because files in the $(SRCS)
+    # attribute are not executable by default, and we can't touch in place.
+    cmd = "cp $(location :runsc) $(@D)/runsc && \
+        chmod a+x $(@D)/runsc && \
+        $(@D)/runsc -version | grep version | sed 's/^[^0-9]*//' > $@ && \
+        rm -f $(@D)/runsc",
+    stamp = 1,
+)
+
+pkg_deb(
+    name = "runsc-debian",
+    architecture = "amd64",
+    data = ":debian-data",
+    # Note that the description_file will be flattened (all newlines removed),
+    # and therefore it is kept to a simple one-line description. The expected
+    # format for debian packages is "short summary\nLonger explanation of
+    # tool." and this is impossible with the flattening.
+    description_file = "debian/description",
+    homepage = "https://gvisor.dev/",
+    maintainer = "The gVisor Authors <gvisor-dev@googlegroups.com>",
+    package = "runsc",
+    postinst = "debian/postinst.sh",
+    version_file = ":version.txt",
+    visibility = [
+        "//visibility:public",
+    ],
+)
+
+sh_test(
+    name = "version_test",
+    size = "small",
+    srcs = ["version_test.sh"],
+    args = ["$(location :runsc)"],
+    data = [":runsc"],
+    tags = ["noguitar"],
+)
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
new file mode 100644
index 000000000..a907c103b
--- /dev/null
+++ b/runsc/boot/BUILD
@@ -0,0 +1,133 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "boot",
+    srcs = [
+        "compat.go",
+        "compat_amd64.go",
+        "compat_arm64.go",
+        "config.go",
+        "controller.go",
+        "debug.go",
+        "events.go",
+        "fs.go",
+        "limits.go",
+        "loader.go",
+        "network.go",
+        "strace.go",
+        "vfs.go",
+    ],
+    visibility = [
+        "//pkg/test:__subpackages__",
+        "//runsc:__subpackages__",
+        "//test:__subpackages__",
+    ],
+    deps = [
+        "//pkg/abi",
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/control/server",
+        "//pkg/cpuid",
+        "//pkg/eventchannel",
+        "//pkg/fspath",
+        "//pkg/log",
+        "//pkg/memutil",
+        "//pkg/rand",
+        "//pkg/refs",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/arch:registers_go_proto",
+        "//pkg/sentry/control",
+        "//pkg/sentry/devices/memdev",
+        "//pkg/sentry/fdimport",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/fs/dev",
+        "//pkg/sentry/fs/gofer",
+        "//pkg/sentry/fs/host",
+        "//pkg/sentry/fs/proc",
+        "//pkg/sentry/fs/ramfs",
+        "//pkg/sentry/fs/sys",
+        "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/fs/tty",
+        "//pkg/sentry/fs/user",
+        "//pkg/sentry/fsimpl/devpts",
+        "//pkg/sentry/fsimpl/devtmpfs",
+        "//pkg/sentry/fsimpl/gofer",
+        "//pkg/sentry/fsimpl/host",
+        "//pkg/sentry/fsimpl/proc",
+        "//pkg/sentry/fsimpl/sys",
+        "//pkg/sentry/fsimpl/tmpfs",
+        "//pkg/sentry/inet",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel:uncaught_signal_go_proto",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/limits",
+        "//pkg/sentry/loader",
+        "//pkg/sentry/pgalloc",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/sighandling",
+        "//pkg/sentry/socket/hostinet",
+        "//pkg/sentry/socket/netlink",
+        "//pkg/sentry/socket/netlink/route",
+        "//pkg/sentry/socket/netlink/uevent",
+        "//pkg/sentry/socket/netstack",
+        "//pkg/sentry/socket/unix",
+        "//pkg/sentry/state",
+        "//pkg/sentry/strace",
+        "//pkg/sentry/syscalls/linux/vfs2",
+        "//pkg/sentry/time",
+        "//pkg/sentry/unimpl:unimplemented_syscall_go_proto",
+        "//pkg/sentry/usage",
+        "//pkg/sentry/vfs",
+        "//pkg/sentry/watchdog",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/tcpip",
+        "//pkg/tcpip/link/fdbased",
+        "//pkg/tcpip/link/loopback",
+        "//pkg/tcpip/link/qdisc/fifo",
+        "//pkg/tcpip/link/sniffer",
+        "//pkg/tcpip/network/arp",
+        "//pkg/tcpip/network/ipv4",
+        "//pkg/tcpip/network/ipv6",
+        "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/icmp",
+        "//pkg/tcpip/transport/raw",
+        "//pkg/tcpip/transport/tcp",
+        "//pkg/tcpip/transport/udp",
+        "//pkg/urpc",
+        "//runsc/boot/filter",
+        "//runsc/boot/platforms",
+        "//runsc/boot/pprof",
+        "//runsc/specutils",
+        "@com_github_golang_protobuf//proto:go_default_library",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+go_test(
+    name = "boot_test",
+    size = "small",
+    srcs = [
+        "compat_test.go",
+        "fs_test.go",
+        "loader_test.go",
+    ],
+    library = ":boot",
+    deps = [
+        "//pkg/control/server",
+        "//pkg/fspath",
+        "//pkg/log",
+        "//pkg/p9",
+        "//pkg/sentry/contexttest",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/unet",
+        "//runsc/fsgofer",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
new file mode 100644
index 000000000..b7cfb35bf
--- /dev/null
+++ b/runsc/boot/compat.go
@@ -0,0 +1,196 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"os"
+	"syscall"
+
+	"github.com/golang/protobuf/proto"
+	"gvisor.dev/gvisor/pkg/eventchannel"
+	"gvisor.dev/gvisor/pkg/log"
+	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+	ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
+	"gvisor.dev/gvisor/pkg/sentry/strace"
+	spb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// initCompatLogs creates a compatEmitter for fd and adds it to eventchannel.
+func initCompatLogs(fd int) error {
+	emitter, err := newCompatEmitter(fd)
+	if err == nil {
+		eventchannel.AddEmitter(emitter)
+	}
+	return err
+}
+
+// compatEmitter handles unimplemented-syscall and uncaught-signal events.
+type compatEmitter struct {
+	sink    *log.BasicLogger  // logger that receives the compatibility messages
+	nameMap strace.SyscallMap // used to look up syscall names when logging
+
+	// mu protects the fields below.
+	mu sync.Mutex
+	// trackers maps a syscall number to its tracker instance, which decides
+	// whether an event is logged. Protected by 'mu'.
+	trackers map[uint64]syscallTracker
+}
+
+// newCompatEmitter returns a compatEmitter that logs to the default logger
+// and, when logFD is positive, also to that file through a K8sJSONEmitter.
+func newCompatEmitter(logFD int) (*compatEmitter, error) {
+	table, found := getSyscallNameMap()
+	if !found {
+		return nil, fmt.Errorf("Linux syscall table not found")
+	}
+	// The default logger is always a destination.
+	sink := log.Log()
+	if logFD > 0 {
+		userLog := os.NewFile(uintptr(logFD), "user log file")
+		both := &log.MultiEmitter{sink, log.K8sJSONEmitter{&log.Writer{Next: userLog}}}
+		sink = &log.BasicLogger{Level: log.Info, Emitter: both}
+	}
+	return &compatEmitter{
+		sink:     sink,
+		nameMap:  table,
+		trackers: make(map[uint64]syscallTracker),
+	}, nil
+}
+
+// Emit implements eventchannel.Emitter.
+func (c *compatEmitter) Emit(msg proto.Message) (bool, error) {
+	switch m := msg.(type) {
+	case *spb.UnimplementedSyscall:
+		c.emitUnimplementedSyscall(m)
+	case *ucspb.UncaughtSignal:
+		c.emitUncaughtSignal(m)
+	}
+
+	return false, nil
+}
+
+// emitUnimplementedSyscall logs an unsupported syscall unless the tracker for
+// that syscall number says the event should not be reported.
+func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) {
+	regs := us.Registers
+
+	// c.trackers is protected by c.mu.
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	nr := syscallNum(regs)
+	tracker := c.trackers[nr]
+	if tracker == nil {
+		// First event for this syscall: pick which arguments, if any,
+		// distinguish one report from the next.
+		switch nr {
+		case syscall.SYS_PRCTL:
+			tracker = newArgsTracker(0) // args: cmd, ...
+		case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL, syscall.SYS_FUTEX, syscall.SYS_FALLOCATE:
+			tracker = newArgsTracker(1) // args: fd/addr, cmd, ...
+		case syscall.SYS_GETSOCKOPT, syscall.SYS_SETSOCKOPT:
+			tracker = newArgsTracker(1, 2) // args: fd, level, name, ...
+		case syscall.SYS_SEMCTL:
+			tracker = newArgsTracker(2) // args: semid, semnum, cmd, ...
+		default:
+			// Fall back to an arch specific tracker, then to report-once.
+			if tracker = newArchArgsTracker(nr); tracker == nil {
+				tracker = &onceTracker{}
+			}
+		}
+		c.trackers[nr] = tracker
+	}
+
+	// Log only when the tracker allows it, then record the report.
+	if !tracker.shouldReport(regs) {
+		return
+	}
+
+	c.sink.Infof("Unsupported syscall: %s, regs: %+v", c.nameMap.Name(uintptr(nr)), regs)
+	tracker.onReported(regs)
+}
+
+func (c *compatEmitter) emitUncaughtSignal(msg *ucspb.UncaughtSignal) {
+	sig := syscall.Signal(msg.SignalNumber)
+	c.sink.Infof(
+		"Uncaught signal: %q (%d), PID: %d, TID: %d, fault addr: %#x",
+		sig, msg.SignalNumber, msg.Pid, msg.Tid, msg.FaultAddr)
+}
+
+// Close implements eventchannel.Emitter; it clears the sink and returns nil.
+func (c *compatEmitter) Close() error {
+	c.sink = nil
+	return nil
+}
+
+// syscallTracker interface allows filters to apply differently depending on
+// the syscall and arguments.
+type syscallTracker interface {
+	// shouldReport returns true if the syscall should be reported.
+	shouldReport(regs *rpb.Registers) bool
+
+	// onReported marks the syscall as reported.
+	onReported(regs *rpb.Registers)
+}
+
+// onceTracker reports only a single time, used for most syscalls.
+type onceTracker struct {
+	reported bool // set by onReported; shouldReport returns false afterwards
+}
+
+func (o *onceTracker) shouldReport(_ *rpb.Registers) bool {
+	return !o.reported
+}
+
+func (o *onceTracker) onReported(_ *rpb.Registers) {
+	o.reported = true
+}
+
+// argsTracker is a syscallTracker that reports once per distinct combination
+// of the selected arguments (e.g. once per ioctl 'cmd'), up to reportLimit.
+type argsTracker struct {
+	argsIdx  []int               // indexes of the arguments used as unique ID
+	reported map[string]struct{} // keys that were already reported
+	count    int                 // number of reports recorded so far
+}
+
+func newArgsTracker(argIdx ...int) *argsTracker {
+	return &argsTracker{reported: make(map[string]struct{}), argsIdx: argIdx}
+}
+
+// key joins the tracked argument values of regs, each followed by '|'.
+func (a *argsTracker) key(regs *rpb.Registers) string {
+	k := ""
+	for i := 0; i < len(a.argsIdx); i++ {
+		k = fmt.Sprintf("%s%d|", k, argVal(a.argsIdx[i], regs))
+	}
+	return k
+}
+
+// shouldReport is true while under reportLimit and regs' key is unreported.
+func (a *argsTracker) shouldReport(regs *rpb.Registers) bool {
+	if a.count < reportLimit {
+		_, seen := a.reported[a.key(regs)]
+		return !seen
+	}
+	return false
+}
+
+func (a *argsTracker) onReported(regs *rpb.Registers) {
+	a.reported[a.key(regs)] = struct{}{} // remember this argument combination
+	a.count++
+}
diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go
new file mode 100644
index 000000000..42b0ca8b0
--- /dev/null
+++ b/runsc/boot/compat_amd64.go
@@ -0,0 +1,96 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+	"gvisor.dev/gvisor/pkg/sentry/strace"
+)
+
+// reportLimit is the max number of events that should be reported per tracker.
+const reportLimit = 100
+
+// newRegs creates an empty Registers instance whose Arch is the AMD64 variant
+// wrapping a zero-valued AMD64Registers.
+func newRegs() *rpb.Registers {
+	amd64 := &rpb.AMD64Registers{}
+	return &rpb.Registers{
+		Arch: &rpb.Registers_Amd64{Amd64: amd64},
+	}
+}
+
+// argVal returns syscall argument argIdx (0 through 5) read from the AMD64
+// registers in regs, truncated to 32 bits. It panics on any other argIdx.
+func argVal(argIdx int, regs *rpb.Registers) uint32 {
+	r := regs.GetArch().(*rpb.Registers_Amd64).Amd64
+
+	if argIdx < 0 || argIdx > 5 {
+		panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+	}
+	// Registers that carry the syscall arguments, in argument order.
+	args := [...]uint64{
+		0: r.Rdi,
+		1: r.Rsi,
+		2: r.Rdx,
+		3: r.R10,
+		4: r.R8,
+		5: r.R9,
+	}
+	return uint32(args[argIdx])
+}
+
+// setArgVal stores argVal in the AMD64 register of regs that carries syscall
+// argument argIdx (0 through 5). It panics on any other argIdx.
+func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) {
+	r := regs.GetArch().(*rpb.Registers_Amd64).Amd64
+
+	if argIdx < 0 || argIdx > 5 {
+		panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+	}
+
+	// Destination registers, in syscall argument order.
+	slots := [...]*uint64{
+		0: &r.Rdi,
+		1: &r.Rsi,
+		2: &r.Rdx,
+		3: &r.R10,
+		4: &r.R8,
+		5: &r.R9,
+	}
+	*slots[argIdx] = argVal
+}
+
+// getSyscallNameMap looks up the Linux/AMD64 syscall map in strace.
+func getSyscallNameMap() (strace.SyscallMap, bool) {
+	return strace.Lookup(abi.Linux, arch.AMD64)
+}
+
+// syscallNum returns the syscall number held in OrigRax.
+func syscallNum(regs *rpb.Registers) uint64 {
+	return regs.GetArch().(*rpb.Registers_Amd64).Amd64.OrigRax
+}
+
+// newArchArgsTracker returns the tracker for AMD64-only syscalls, or nil.
+func newArchArgsTracker(sysnr uint64) syscallTracker {
+	if sysnr != syscall.SYS_ARCH_PRCTL {
+		return nil
+	}
+	return newArgsTracker(0) // args: cmd, ...
+}
diff --git a/runsc/boot/compat_arm64.go b/runsc/boot/compat_arm64.go
new file mode 100644
index 000000000..f784cd237
--- /dev/null
+++ b/runsc/boot/compat_arm64.go
@@ -0,0 +1,91 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+	"gvisor.dev/gvisor/pkg/sentry/strace"
+)
+
+// reportLimit is the max number of events that should be reported per tracker.
+const reportLimit = 100
+
+// newRegs creates an empty Registers instance whose Arch is the ARM64 variant
+// wrapping a zero-valued ARM64Registers.
+func newRegs() *rpb.Registers {
+	arm64 := &rpb.ARM64Registers{}
+	return &rpb.Registers{
+		Arch: &rpb.Registers_Arm64{Arm64: arm64},
+	}
+}
+
+// argVal returns syscall argument argIdx (0 through 5) read from the ARM64
+// registers in regs, truncated to 32 bits. It panics on any other argIdx.
+func argVal(argIdx int, regs *rpb.Registers) uint32 {
+	r := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+
+	if argIdx < 0 || argIdx > 5 {
+		panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+	}
+	// Registers that carry the syscall arguments, in argument order.
+	args := [...]uint64{
+		0: r.R0,
+		1: r.R1,
+		2: r.R2,
+		3: r.R3,
+		4: r.R4,
+		5: r.R5,
+	}
+	return uint32(args[argIdx])
+}
+
+// setArgVal stores argVal in the ARM64 register of regs that carries syscall
+// argument argIdx (0 through 5). It panics on any other argIdx.
+func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) {
+	r := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+
+	if argIdx < 0 || argIdx > 5 {
+		panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+	}
+
+	// Destination registers, in syscall argument order.
+	slots := [...]*uint64{
+		0: &r.R0,
+		1: &r.R1,
+		2: &r.R2,
+		3: &r.R3,
+		4: &r.R4,
+		5: &r.R5,
+	}
+	*slots[argIdx] = argVal
+}
+
+// getSyscallNameMap looks up the Linux/ARM64 syscall map in strace.
+func getSyscallNameMap() (strace.SyscallMap, bool) {
+	return strace.Lookup(abi.Linux, arch.ARM64)
+}
+
+func syscallNum(regs *rpb.Registers) uint64 {
+	return regs.GetArch().(*rpb.Registers_Arm64).Arm64.R8 // the number is read from R8
+}
+
+// newArchArgsTracker returns nil; no arch specific syscalls are handled yet.
+func newArchArgsTracker(sysnr uint64) syscallTracker {
+	return nil
+}
diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go
new file mode 100644
index 000000000..839c5303b
--- /dev/null
+++ b/runsc/boot/compat_test.go
@@ -0,0 +1,90 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"testing"
+)
+
+func TestOnceTracker(t *testing.T) {
+	o := onceTracker{}
+	if !o.shouldReport(nil) { // nothing reported yet, so the first query must be true
+		t.Error("first call to shouldReport, got: false, want: true")
+	}
+	o.onReported(nil)
+	for i := 0; i < 2; i++ { // once reported, every later query must be false
+		if o.shouldReport(nil) {
+			t.Error("call to shouldReport after onReported, got: true, want: false")
+		}
+	}
+}
+
+func TestArgsTracker(t *testing.T) {
+	for _, tc := range []struct {
+		name   string
+		idx    []int  // argument indexes the tracker keys on
+		arg1_1 uint64 // argument 0 for the first call
+		arg1_2 uint64 // argument 0 for the second call
+		arg2_1 uint64 // argument 1 for the first call
+		arg2_2 uint64 // argument 1 for the second call
+		want   bool   // expected shouldReport result for the second call
+	}{
+		{name: "same arg1", idx: []int{0}, arg1_1: 123, arg1_2: 123, want: false},
+		{name: "same arg2", idx: []int{1}, arg2_1: 123, arg2_2: 123, want: false},
+		{name: "diff arg1", idx: []int{0}, arg1_1: 123, arg1_2: 321, want: true},
+		{name: "diff arg2", idx: []int{1}, arg2_1: 123, arg2_2: 321, want: true},
+		{name: "cmd is uint32", idx: []int{1}, arg2_1: 0xdead00000123, arg2_2: 0xbeef00000123, want: false},
+		{name: "same 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 123, arg1_2: 321, want: false},
+		{name: "diff 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 789, arg1_2: 987, want: true},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			c := newArgsTracker(tc.idx...)
+			regs := newRegs()
+			setArgVal(0, tc.arg1_1, regs)
+			setArgVal(1, tc.arg2_1, regs)
+			if !c.shouldReport(regs) {
+				t.Error("first call to shouldReport, got: false, want: true")
+			}
+			c.onReported(regs)
+			// Query again with the second set of argument values.
+			setArgVal(0, tc.arg1_2, regs)
+			setArgVal(1, tc.arg2_2, regs)
+			if got := c.shouldReport(regs); tc.want != got {
+				t.Errorf("second call to shouldReport, got: %t, want: %t", got, tc.want)
+			}
+		})
+	}
+}
+
+// TestArgsTrackerLimit checks that argsTracker stops reporting at reportLimit.
+func TestArgsTrackerLimit(t *testing.T) {
+	tracker := newArgsTracker(0, 1)
+	for n := uint64(0); n < reportLimit; n++ {
+		r := newRegs()
+		setArgVal(0, 123, r)
+		setArgVal(1, n, r)
+		if !tracker.shouldReport(r) {
+			t.Error("shouldReport before limit was reached, got: false, want: true")
+		}
+		tracker.onReported(r)
+	}
+	// A new pair must now be rejected because the count limit was hit.
+	final := newRegs()
+	setArgVal(0, 123, final)
+	setArgVal(1, 123456, final)
+	if tracker.shouldReport(final) {
+		t.Error("shouldReport after limit was reached, got: true, want: false")
+	}
+}
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
new file mode 100644
index 000000000..6d6a705f8
--- /dev/null
+++ b/runsc/boot/config.go
@@ -0,0 +1,319 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/watchdog"
+)
+
+// FileAccessType selects the way the filesystem is accessed.
+type FileAccessType int
+
+const (
+	// FileAccessShared routes IO requests through a Gofer process, which
+	// validates each request before forwarding it to the host.
+	FileAccessShared FileAccessType = iota
+
+	// FileAccessExclusive behaves like FileAccessShared with extra caching
+	// turned on for better performance. Use it only when the sandbox has
+	// exclusive access to the filesystem.
+	FileAccessExclusive
+)
+
+// MakeFileAccessType parses s ("shared" or "exclusive") into a FileAccessType.
+// Any other input yields the zero value together with a non-nil error.
+func MakeFileAccessType(s string) (FileAccessType, error) {
+	if s == "shared" {
+		return FileAccessShared, nil
+	}
+	if s == "exclusive" {
+		return FileAccessExclusive, nil
+	}
+	return 0, fmt.Errorf("invalid file access type %q", s)
+}
+
+// String returns the textual name of f, or "unknown(N)" for other values.
+func (f FileAccessType) String() string {
+	if f == FileAccessShared {
+		return "shared"
+	}
+	if f == FileAccessExclusive {
+		return "exclusive"
+	}
+	return fmt.Sprintf("unknown(%d)", f)
+}
+
+// NetworkType selects which network stack to use.
+type NetworkType int
+
+const (
+	// NetworkSandbox is the internal network stack, isolated from the host.
+	NetworkSandbox NetworkType = iota
+
+	// NetworkHost forwards network related syscalls to the host network.
+	NetworkHost
+
+	// NetworkNone configures only loopback, using netstack.
+	NetworkNone
+)
+
+// MakeNetworkType parses s ("sandbox", "host" or "none") into a NetworkType.
+// Any other input yields the zero value together with a non-nil error.
+func MakeNetworkType(s string) (NetworkType, error) {
+	known := map[string]NetworkType{
+		"sandbox": NetworkSandbox,
+		"host":    NetworkHost,
+		"none":    NetworkNone,
+	}
+	if nt, ok := known[s]; ok {
+		return nt, nil
+	}
+	return 0, fmt.Errorf("invalid network type %q", s)
+}
+
+// String returns the textual name of n, or "unknown(N)" for other values.
+func (n NetworkType) String() string {
+	names := [...]string{
+		NetworkSandbox: "sandbox",
+		NetworkHost:    "host",
+		NetworkNone:    "none",
+	}
+	if n >= 0 && int(n) < len(names) {
+		return names[n]
+	}
+	return fmt.Sprintf("unknown(%d)", n)
+}
+
+// MakeWatchdogAction parses s, ignoring case, into a watchdog.Action.
+func MakeWatchdogAction(s string) (watchdog.Action, error) {
+	lowered := strings.ToLower(s)
+	if lowered == "log" || lowered == "logwarning" {
+		return watchdog.LogWarning, nil
+	}
+	if lowered == "panic" {
+		return watchdog.Panic, nil
+	}
+	return 0, fmt.Errorf("invalid watchdog action %q", s)
+}
+
+// MakeRefsLeakMode parses s, ignoring case, into a refs.LeakMode. Unknown
+// names yield a non-nil error.
+func MakeRefsLeakMode(s string) (refs.LeakMode, error) {
+	known := map[string]refs.LeakMode{
+		"disabled":   refs.NoLeakChecking,
+		"log-names":  refs.LeaksLogWarning,
+		"log-traces": refs.LeaksLogTraces,
+	}
+	if mode, ok := known[strings.ToLower(s)]; ok {
+		return mode, nil
+	}
+	return 0, fmt.Errorf("invalid refs leakmode %q", s)
+}
+
+// refsLeakModeToString maps mode to its flag name; it panics on unknown modes.
+func refsLeakModeToString(mode refs.LeakMode) string {
+	// If not set, default it to disabled.
+	if mode == refs.UninitializedLeakChecking || mode == refs.NoLeakChecking {
+		return "disabled"
+	}
+	if mode == refs.LeaksLogWarning {
+		return "log-names"
+	}
+	if mode == refs.LeaksLogTraces {
+		return "log-traces"
+	}
+	panic(fmt.Sprintf("Invalid leakmode: %d", mode))
+}
+
+// Config holds configuration that is not part of the runtime spec.
+type Config struct {
+	// RootDir is the runtime root directory.
+	RootDir string
+
+	// Debug indicates that debug logging should be enabled.
+	Debug bool
+
+	// LogFilename is the filename to log to, if not empty.
+	LogFilename string
+
+	// LogFormat is the log format.
+	LogFormat string
+
+	// DebugLog is the path to log debug information to, if not empty.
+	DebugLog string
+
+	// PanicLog is the path to log Go's runtime messages, if not empty.
+	PanicLog string
+
+	// DebugLogFormat is the log format for debug.
+	DebugLogFormat string
+
+	// FileAccess indicates how the filesystem is accessed.
+	FileAccess FileAccessType
+
+	// Overlay is whether to wrap the root filesystem in an overlay.
+	Overlay bool
+
+	// FSGoferHostUDS enables the gofer to mount a host UDS.
+	FSGoferHostUDS bool
+
+	// Network indicates what type of network to use.
+	Network NetworkType
+
+	// EnableRaw indicates whether raw sockets should be enabled. Raw
+	// sockets are disabled by stripping CAP_NET_RAW from the list of
+	// capabilities.
+	EnableRaw bool
+
+	// HardwareGSO indicates that hardware segmentation offload is enabled.
+	HardwareGSO bool
+
+	// SoftwareGSO indicates that software segmentation offload is enabled.
+	SoftwareGSO bool
+
+	// QDisc indicates the type of queueing discipline to use by default
+	// for non-loopback interfaces.
+	QDisc QueueingDiscipline
+
+	// LogPackets indicates that all network packets should be logged.
+	LogPackets bool
+
+	// Platform is the platform to run on.
+	Platform string
+
+	// Strace indicates that strace should be enabled.
+	Strace bool
+
+	// StraceSyscalls is the set of syscalls to trace.  If Strace is
+	// true and this list is empty, then all syscalls will be traced.
+	StraceSyscalls []string
+
+	// StraceLogSize is the max size of data blobs to display.
+	StraceLogSize uint
+
+	// DisableSeccomp indicates whether seccomp syscall filters should be
+	// disabled. Pardon the double negation, but default to enabled is important.
+	DisableSeccomp bool
+
+	// WatchdogAction sets what action the watchdog takes when triggered.
+	WatchdogAction watchdog.Action
+
+	// PanicSignal registers signal handling that panics. Usually set to
+	// SIGUSR2(12) to troubleshoot hangs. -1 disables it.
+	PanicSignal int
+
+	// ProfileEnable is set to prepare the sandbox to be profiled.
+	ProfileEnable bool
+
+	// RestoreFile is the path to the saved container image.
+	RestoreFile string
+
+	// NumNetworkChannels controls the number of AF_PACKET sockets that map
+	// to the same underlying network device. This allows netstack to better
+	// scale for high throughput use cases.
+	NumNetworkChannels int
+
+	// Rootless allows the sandbox to be started with a user that is not root.
+	// Defense in depth measures are weaker with rootless. Specifically, the
+	// sandbox and Gofer process run as root inside a user namespace with root
+	// mapped to the caller's user.
+	Rootless bool
+
+	// AlsoLogToStderr allows to send log messages to stderr.
+	AlsoLogToStderr bool
+
+	// ReferenceLeakMode sets reference leak check mode.
+	ReferenceLeakMode refs.LeakMode
+
+	// OverlayfsStaleRead causes cached FDs to reopen after a file is opened for
+	// write to workaround overlayfs limitation on kernels before 4.19.
+	OverlayfsStaleRead bool
+
+	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
+	// tests. It allows runsc to start the sandbox process as the current
+	// user, and without chrooting the sandbox process. This can be
+	// necessary in test environments that have limited capabilities.
+	TestOnlyAllowRunAsCurrentUserWithoutChroot bool
+
+	// TestOnlyTestNameEnv should only be used in tests. It looks up for the
+	// test name in the container environment variables and adds it to the debug
+	// log file name. This is done to help identify the log with the test when
+	// multiple tests are run in parallel, since there is no way to pass
+	// parameters to the runtime from docker.
+	TestOnlyTestNameEnv string
+
+	// CPUNumFromQuota sets CPU number count to available CPU quota, using
+	// least integer value greater than or equal to quota.
+	//
+	// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
+	CPUNumFromQuota bool
+
+	// Enables VFS2 (not plumbed through yet).
+	VFS2 bool
+}
+
+// ToFlags returns a slice of flags that correspond to the given Config.
+func (c *Config) ToFlags() []string {
+	onOff := strconv.FormatBool
+	flags := []string{
+		"--root=" + c.RootDir,
+		"--debug=" + onOff(c.Debug),
+		"--log=" + c.LogFilename,
+		"--log-format=" + c.LogFormat,
+		"--debug-log=" + c.DebugLog,
+		"--panic-log=" + c.PanicLog,
+		"--debug-log-format=" + c.DebugLogFormat,
+		"--file-access=" + c.FileAccess.String(),
+		"--overlay=" + onOff(c.Overlay),
+		"--fsgofer-host-uds=" + onOff(c.FSGoferHostUDS),
+		"--network=" + c.Network.String(),
+		"--log-packets=" + onOff(c.LogPackets),
+		"--platform=" + c.Platform,
+		"--strace=" + onOff(c.Strace),
+		"--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","),
+		"--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)),
+		"--watchdog-action=" + c.WatchdogAction.String(),
+		"--panic-signal=" + strconv.Itoa(c.PanicSignal),
+		"--profile=" + onOff(c.ProfileEnable),
+		"--net-raw=" + onOff(c.EnableRaw),
+		"--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
+		"--rootless=" + onOff(c.Rootless),
+		"--alsologtostderr=" + onOff(c.AlsoLogToStderr),
+		"--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode),
+		"--gso=" + onOff(c.HardwareGSO),
+		"--software-gso=" + onOff(c.SoftwareGSO),
+		"--overlayfs-stale-read=" + onOff(c.OverlayfsStaleRead),
+		"--qdisc=" + c.QDisc.String(),
+	}
+	if c.CPUNumFromQuota {
+		flags = append(flags, "--cpu-num-from-quota")
+	}
+	// The TESTONLY flags are never meant for users, so emit them only if set.
+	if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+		flags = append(flags, "--TESTONLY-unsafe-nonroot=true")
+	}
+	if c.TestOnlyTestNameEnv != "" {
+		flags = append(flags, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv)
+	}
+	if c.VFS2 {
+		flags = append(flags, "--vfs2=true")
+	}
+
+	return flags
+}
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
new file mode 100644
index 000000000..8125d5061
--- /dev/null
+++ b/runsc/boot/controller.go
@@ -0,0 +1,506 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"syscall"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/control/server"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
+	"gvisor.dev/gvisor/pkg/sentry/state"
+	"gvisor.dev/gvisor/pkg/sentry/time"
+	"gvisor.dev/gvisor/pkg/sentry/watchdog"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/urpc"
+	"gvisor.dev/gvisor/runsc/boot/pprof"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+const (
+	// ContainerCheckpoint is the URPC endpoint for checkpointing a container.
+	ContainerCheckpoint = "containerManager.Checkpoint"
+
+	// ContainerCreate is the URPC endpoint for creating a container.
+	ContainerCreate = "containerManager.Create"
+
+	// ContainerDestroy is used to stop a non-root container and free all
+	// associated resources in the sandbox.
+	ContainerDestroy = "containerManager.Destroy"
+
+	// ContainerEvent is the URPC endpoint for getting stats about the
+	// container used by "runsc events".
+	ContainerEvent = "containerManager.Event"
+
+	// ContainerExecuteAsync is the URPC endpoint for executing a command in a
+	// container.
+	ContainerExecuteAsync = "containerManager.ExecuteAsync"
+
+	// ContainerPause is the URPC endpoint for pausing the container.
+	ContainerPause = "containerManager.Pause"
+
+	// ContainerProcesses is the URPC endpoint for getting the list of
+	// processes running in a container.
+	ContainerProcesses = "containerManager.Processes"
+
+	// ContainerRestore is the URPC endpoint for restoring from a statefile.
+	ContainerRestore = "containerManager.Restore"
+
+	// ContainerResume is the URPC endpoint for unpausing a paused container.
+	ContainerResume = "containerManager.Resume"
+
+	// ContainerSignal is used to send a signal to a container.
+	ContainerSignal = "containerManager.Signal"
+
+	// ContainerSignalProcess is used to send a signal to a particular
+	// process in a container.
+	ContainerSignalProcess = "containerManager.SignalProcess"
+
+	// ContainerStart is the URPC endpoint for running a non-root container
+	// within a sandbox.
+	ContainerStart = "containerManager.Start"
+
+	// ContainerWait is used to wait on the init process of the container
+	// and return its ExitStatus.
+	ContainerWait = "containerManager.Wait"
+
+	// ContainerWaitPID is used to wait on a process with a certain PID in
+	// the sandbox and return its ExitStatus.
+	ContainerWaitPID = "containerManager.WaitPID"
+
+	// NetworkCreateLinksAndRoutes is the URPC endpoint for creating links
+	// and routes in a network stack.
+	NetworkCreateLinksAndRoutes = "Network.CreateLinksAndRoutes"
+
+	// RootContainerStart is the URPC endpoint for starting a new sandbox
+	// with root container.
+	RootContainerStart = "containerManager.StartRoot"
+
+	// SandboxStacks is the URPC endpoint for collecting sandbox stacks.
+	SandboxStacks = "debug.Stacks"
+)
+
+// Method names of the profiling-related commands (see pprof.go for details).
+const (
+	StartCPUProfile  = "Profile.StartCPUProfile"
+	StopCPUProfile   = "Profile.StopCPUProfile"
+	HeapProfile      = "Profile.HeapProfile"
+	GoroutineProfile = "Profile.GoroutineProfile"
+	BlockProfile     = "Profile.BlockProfile"
+	MutexProfile     = "Profile.MutexProfile"
+	StartTrace       = "Profile.StartTrace"
+	StopTrace        = "Profile.StopTrace"
+)
+
+// Method names of the logging-related commands (see logging.go for details).
+const (
+	ChangeLogging = "Logging.Change"
+)
+
+// ControlSocketAddr returns the abstract unix socket name (NUL-prefixed) for id.
+func ControlSocketAddr(id string) string {
+	return fmt.Sprint("\x00runsc-sandbox.", id)
+}
+
+// controller pairs the control server with the containerManager registered on
+// it, and is used for communication into the sandbox.
+type controller struct {
+	// srv is the control server.
+	srv *server.Server
+
+	// manager holds the containerManager methods.
+	manager *containerManager
+}
+
+// newController creates a control server from fd and registers the sandbox's
+// control objects on it. The caller must call controller.srv.StartServing() to
+// start the controller.
+func newController(fd int, l *Loader) (*controller, error) {
+	ctrlSrv, err := server.CreateFromFD(fd)
+	if err != nil {
+		return nil, err
+	}
+
+	mgr := &containerManager{
+		startChan:       make(chan struct{}),
+		startResultChan: make(chan error),
+		l:               l,
+	}
+	ctrlSrv.Register(mgr)
+
+	// The Network object is only registered when the root stack is netstack.
+	rootStack := l.k.RootNetworkNamespace().Stack()
+	if ns, ok := rootStack.(*netstack.Stack); ok {
+		ctrlSrv.Register(&Network{
+			Stack: ns.Stack,
+		})
+	}
+
+	ctrlSrv.Register(&debug{})
+	ctrlSrv.Register(&control.Logging{})
+	// Profiling commands are registered only when enabled in the config.
+	if l.conf.ProfileEnable {
+		ctrlSrv.Register(&control.Profile{
+			Kernel: l.k,
+		})
+	}
+
+	return &controller{srv: ctrlSrv, manager: mgr}, nil
+}
+
+// containerManager manages sandbox containers.
+type containerManager struct {
+	// startChan is used to signal when the root container process should
+	// be started.
+	startChan chan struct{}
+
+	// startResultChan is used to signal when the root container has
+	// started. Any errors encountered during startup will be sent to the
+	// channel. A nil value indicates success.
+	startResultChan chan error
+
+	// l is the loader that creates containers and sandboxes.
+	l *Loader
+}
+
+// StartRoot signals startChan, then blocks for the result on startResultChan.
+func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error {
+	log.Debugf("containerManager.StartRoot %q", *cid)
+	cm.startChan <- struct{}{}
+	startErr := <-cm.startResultChan
+	if startErr == nil {
+		return nil
+	}
+	return fmt.Errorf("starting sandbox: %v", startErr)
+}
+
+// Processes retrieves information about the processes running in container cid.
+func (cm *containerManager) Processes(cid *string, out *[]*control.Process) error {
+	log.Debugf("containerManager.Processes: %q", *cid)
+	return control.Processes(cm.l.k, *cid, out)
+}
+
+// Create creates the container identified by cid within the sandbox.
+func (cm *containerManager) Create(cid *string, _ *struct{}) error {
+	log.Debugf("containerManager.Create: %q", *cid)
+	return cm.l.createContainer(*cid)
+}
+
+// StartArgs contains arguments to the Start method.
+type StartArgs struct {
+	// Spec is the spec of the container to start.
+	Spec *specs.Spec
+
+	// Conf is the runsc-specific configuration for the sandbox.
+	Conf *Config
+
+	// CID is the ID of the container to start.
+	CID string
+
+	// FilePayload contains, in order:
+	//   * stdin, stdout, and stderr.
+	//   * the file descriptor over which the sandbox will
+	//     request files from its root filesystem.
+	urpc.FilePayload
+}
+
+// Start validates args and then runs a created container within a sandbox.
+func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
+	log.Debugf("containerManager.Start: %+v", args)
+
+	// Validate arguments; each failure returns a fixed message via errors.New.
+	if args == nil {
+		return errors.New("start missing arguments")
+	}
+	if args.Spec == nil {
+		return errors.New("start arguments missing spec")
+	}
+	if args.Conf == nil {
+		return errors.New("start arguments missing config")
+	}
+	if args.CID == "" {
+		return errors.New("start argument missing container ID")
+	}
+	if len(args.FilePayload.Files) < 4 {
+		return errors.New("start arguments must contain stdin, stdout, and stderr followed by at least one file for the container root gofer")
+	}
+
+	// All validation passed; log the spec for debugging.
+	specutils.LogSpec(args.Spec)
+
+	err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files)
+	if err != nil {
+		log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err)
+		return err
+	}
+	log.Debugf("Container %q started", args.CID)
+
+	return nil
+}
+
+// Destroy stops the container identified by cid if it is still running and
+// cleans up its filesystem.
+func (cm *containerManager) Destroy(cid *string, _ *struct{}) error {
+	log.Debugf("containerManager.destroy %q", *cid)
+	return cm.l.destroyContainer(*cid)
+}
+
+// ExecuteAsync starts running a command on a created or running sandbox and
+// stores the PID of the new process in pid.
+func (cm *containerManager) ExecuteAsync(args *control.ExecArgs, pid *int32) error {
+	log.Debugf("containerManager.ExecuteAsync: %+v", args)
+	tgid, execErr := cm.l.executeAsync(args)
+	if execErr == nil {
+		*pid = int32(tgid)
+		return nil
+	}
+	log.Debugf("containerManager.ExecuteAsync failed: %+v: %v", args, execErr)
+	return execErr
+}
+
+// Checkpoint saves the sandbox state with the options in o via control.State.
+func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error {
+	log.Debugf("containerManager.Checkpoint")
+	saver := control.State{
+		Kernel:   cm.l.k,
+		Watchdog: cm.l.watchdog,
+	}
+	return saver.Save(o, nil)
+}
+
+// Pause pauses the loader's kernel. It always returns nil.
+func (cm *containerManager) Pause(_, _ *struct{}) error {
+	log.Debugf("containerManager.Pause")
+	cm.l.k.Pause()
+	return nil
+}
+
+// RestoreOpts contains the arguments to the Restore method.
+type RestoreOpts struct {
+	// FilePayload contains the state file to be restored, followed by the
+	// platform device file if necessary.
+	urpc.FilePayload
+
+	// SandboxID contains the ID of the sandbox.
+	SandboxID string
+}
+
+// Restore loads a container from a statefile.
+// The current kernel is paused and replaced by a new one, a restore
+// environment is created, and the state file is loaded into the new kernel.
+// Finally the root container is signaled to start via startChan.
+func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
+	log.Debugf("containerManager.Restore")
+
+	var specFile, deviceFile *os.File
+	switch numFiles := len(o.FilePayload.Files); numFiles {
+	case 2:
+		// The device file is donated to the platform.
+		// Can't take ownership away from os.File. dup them to get a new FD.
+		fd, err := syscall.Dup(int(o.FilePayload.Files[1].Fd()))
+		if err != nil {
+			return fmt.Errorf("failed to dup file: %v", err)
+		}
+		deviceFile = os.NewFile(uintptr(fd), "platform device")
+		fallthrough
+	case 1:
+		specFile = o.FilePayload.Files[0]
+	case 0:
+		return fmt.Errorf("at least one file must be passed to Restore")
+	default:
+		return fmt.Errorf("at most two files may be passed to Restore")
+	}
+
+	// Pause the kernel while we build a new one.
+	cm.l.k.Pause()
+
+	p, err := createPlatform(cm.l.conf, deviceFile)
+	if err != nil {
+		return fmt.Errorf("creating platform: %v", err)
+	}
+	k := &kernel.Kernel{
+		Platform: p,
+	}
+	mf, err := createMemoryFile()
+	if err != nil {
+		return fmt.Errorf("creating memory file: %v", err)
+	}
+	k.SetMemoryFile(mf)
+	networkStack := cm.l.k.RootNetworkNamespace().Stack()
+	cm.l.k = k
+
+	// Set up the restore environment.
+	mntr := newContainerMounter(cm.l.spec, cm.l.goferFDs, cm.l.k, cm.l.mountHints)
+	renv, err := mntr.createRestoreEnvironment(cm.l.conf)
+	if err != nil {
+		return fmt.Errorf("creating RestoreEnvironment: %v", err)
+	}
+	fs.SetRestoreEnvironment(*renv)
+
+	// Prepare to load from the state file.
+	if eps, ok := networkStack.(*netstack.Stack); ok {
+		stack.StackFromEnv = eps.Stack // FIXME(b/36201077)
+	}
+	info, err := specFile.Stat()
+	if err != nil {
+		return err
+	}
+	if info.Size() == 0 {
+		return fmt.Errorf("file cannot be empty")
+	}
+
+	if cm.l.conf.ProfileEnable {
+		// pprof.Initialize opens /proc/self/maps, so has to be called before
+		// installing seccomp filters.
+		pprof.Initialize()
+	}
+
+	// Seccomp filters have to be applied before parsing the state file.
+	if err := cm.l.installSeccompFilters(); err != nil {
+		return err
+	}
+
+	// Load the state; specFile is the state file (Files[0]) despite its name.
+	loadOpts := state.LoadOpts{Source: specFile}
+	if err := loadOpts.Load(k, networkStack, time.NewCalibratedClocks()); err != nil {
+		return err
+	}
+
+	// Since we have a new kernel we also must make a new watchdog.
+	dogOpts := watchdog.DefaultOpts
+	dogOpts.TaskTimeoutAction = cm.l.conf.WatchdogAction
+	dog := watchdog.New(k, dogOpts)
+
+	// Update the loader for the restored kernel (cm.l.k was already set above).
+	cm.l.k = k
+	cm.l.watchdog = dog
+	cm.l.rootProcArgs = kernel.CreateProcessArgs{}
+	cm.l.restore = true
+
+	// Reinitialize the sandbox ID and processes map. Note that it doesn't
+	// restore the state of multiple containers, nor exec processes.
+	cm.l.sandboxID = o.SandboxID
+	cm.l.mu.Lock()
+	eid := execID{cid: o.SandboxID}
+	cm.l.processes = map[execID]*execProcess{
+		eid: {
+			tg: cm.l.k.GlobalInit(),
+		},
+	}
+	cm.l.mu.Unlock()
+
+	// Tell the root container to start and wait for the result.
+	cm.startChan <- struct{}{}
+	if err := <-cm.startResultChan; err != nil {
+		return fmt.Errorf("starting sandbox: %v", err)
+	}
+
+	return nil
+}
+
+// Resume unpauses the loader's kernel. It always returns nil.
+func (cm *containerManager) Resume(_, _ *struct{}) error {
+	log.Debugf("containerManager.Resume")
+	cm.l.k.Unpause()
+	return nil
+}
+
+// Wait waits for the init process in container cid and logs the wait status.
+func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error {
+	log.Debugf("containerManager.Wait")
+	err := cm.l.waitContainer(*cid, waitStatus)
+	log.Debugf("containerManager.Wait returned, waitStatus: %v: %v", *waitStatus, err)
+	return err
+}
+
+// WaitPIDArgs are the arguments to the WaitPID method.
+type WaitPIDArgs struct {
+	// PID is the PID in the container's PID namespace.
+	PID int32
+
+	// CID is the container ID.
+	CID string
+}
+
+// WaitPID waits for the process with PID args.PID in container args.CID.
+func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error {
+	log.Debugf("containerManager.WaitPID")
+	return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus)
+}
+
+// SignalDeliveryMode enumerates the ways Signal can deliver a signal.
+type SignalDeliveryMode int
+
+const (
+	// DeliverToProcess delivers the signal to the container process with
+	// the specified PID. If PID is 0, then the container init process is
+	// signaled.
+	DeliverToProcess SignalDeliveryMode = iota
+
+	// DeliverToAllProcesses delivers the signal to all processes in the
+	// container. PID must be 0.
+	DeliverToAllProcesses
+
+	// DeliverToForegroundProcessGroup delivers the signal to the
+	// foreground process group in the same TTY session as the specified
+	// process. If PID is 0, then the signal is delivered to the foreground
+	// process group of the init process's TTY.
+	DeliverToForegroundProcessGroup
+)
+
+func (s SignalDeliveryMode) String() string {
+	names := map[SignalDeliveryMode]string{
+		DeliverToProcess:                "Process",
+		DeliverToAllProcesses:           "All",
+		DeliverToForegroundProcessGroup: "Foreground Process Group",
+	}
+	if name, ok := names[s]; ok {
+		return name
+	}
+	return fmt.Sprintf("unknown signal delivery mode: %d", s)
+}
+
+// SignalArgs are the arguments to the Signal method.
+type SignalArgs struct {
+	// CID is the container ID.
+	CID string
+
+	// Signo is the signal to send to the process.
+	Signo int32
+
+	// PID is the process ID in the given container that will be signaled.
+	// If 0, the container init process will be signaled.
+	PID int32
+
+	// Mode is the signal delivery mode.
+	Mode SignalDeliveryMode
+}
+
+// Signal sends a signal to one or more processes in a container. If args.PID
+// is 0, then the container init process is used. Depending on the
+// args.Mode option, the signal may be sent directly to the
+// indicated process, to all processes in the container, or to the foreground
+// process group.
+func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error {
+	log.Debugf("containerManager.Signal %+v", args)
+	return cm.l.signal(args.CID, args.PID, args.Signo, args.Mode)
+}
diff --git a/runsc/boot/debug.go b/runsc/boot/debug.go
new file mode 100644
index 000000000..1fb32c527
--- /dev/null
+++ b/runsc/boot/debug.go
@@ -0,0 +1,29 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"gvisor.dev/gvisor/pkg/log"
+)
+
+type debug struct {
+}
+
+// Stacks stores the output of log.Stacks(true) in 'stacks' as a string.
+func (*debug) Stacks(_ *struct{}, stacks *string) error {
+	dump := log.Stacks(true)
+	*stacks = string(dump)
+	return nil
+}
diff --git a/runsc/boot/events.go b/runsc/boot/events.go
new file mode 100644
index 000000000..422f4da00
--- /dev/null
+++ b/runsc/boot/events.go
@@ -0,0 +1,81 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+)
+
+// Event is the JSON-encodable event record. It corresponds to runc's
+// main.event struct.
+type Event struct {
+	Type string      `json:"type"`
+	ID   string      `json:"id"`
+	Data interface{} `json:"data,omitempty"`
+}
+
+// Stats is the runc-specific stats structure, kept stable for encoding and
+// decoding stats.
+type Stats struct {
+	Memory Memory `json:"memory"`
+	Pids   Pids   `json:"pids"`
+}
+
+// Pids contains stats on processes; zero values are omitted from the JSON.
+type Pids struct {
+	Current uint64 `json:"current,omitempty"`
+	Limit   uint64 `json:"limit,omitempty"`
+}
+
+// MemoryEntry contains stats on one kind of memory (limit, usage, max, failcnt).
+type MemoryEntry struct {
+	Limit   uint64 `json:"limit"`
+	Usage   uint64 `json:"usage,omitempty"`
+	Max     uint64 `json:"max,omitempty"`
+	Failcnt uint64 `json:"failcnt"`
+}
+
+// Memory contains stats on memory. Note: omitempty never omits struct fields.
+type Memory struct {
+	Cache     uint64            `json:"cache,omitempty"`
+	Usage     MemoryEntry       `json:"usage,omitempty"`
+	Swap      MemoryEntry       `json:"swap,omitempty"`
+	Kernel    MemoryEntry       `json:"kernel,omitempty"`
+	KernelTCP MemoryEntry       `json:"kernelTCP,omitempty"`
+	Raw       map[string]uint64 `json:"raw,omitempty"`
+}
+
+// Event fills out with a "stats" event carrying memory and PID statistics.
+func (cm *containerManager) Event(_ *struct{}, out *Event) error {
+	s := new(Stats)
+	s.populateMemory(cm.l.k)
+	s.populatePIDs(cm.l.k)
+	*out = Event{Data: s, Type: "stats"}
+	return nil
+}
+
+// populateMemory calls UpdateUsage on the kernel's memory file and then
+// records the total from usage.MemoryAccounting.Copy as Memory.Usage.Usage.
+func (s *Stats) populateMemory(k *kernel.Kernel) {
+	k.MemoryFile().UpdateUsage()
+	_, total := usage.MemoryAccounting.Copy()
+	s.Memory.Usage = MemoryEntry{Usage: total}
+}
+
+// populatePIDs sets Pids.Current to the count of TaskSet().Root.ThreadGroups().
+func (s *Stats) populatePIDs(k *kernel.Kernel) {
+	s.Pids.Current = uint64(len(k.TaskSet().Root.ThreadGroups()))
+}
diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD
new file mode 100644
index 000000000..ed18f0047
--- /dev/null
+++ b/runsc/boot/filter/BUILD
@@ -0,0 +1,28 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+# Seccomp filter rules for the sandbox; visible only to //runsc/boot and its
+# subpackages.
+go_library(
+    name = "filter",
+    srcs = [
+        "config.go",
+        "config_amd64.go",
+        "config_arm64.go",
+        "config_profile.go",
+        "extra_filters.go",
+        "extra_filters_msan.go",
+        "extra_filters_race.go",
+        "filter.go",
+    ],
+    visibility = ["//runsc/boot:__subpackages__"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/log",
+        "//pkg/seccomp",
+        "//pkg/sentry/platform",
+        "//pkg/tcpip/link/fdbased",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
new file mode 100644
index 000000000..1828d116a
--- /dev/null
+++ b/runsc/boot/filter/config.go
@@ -0,0 +1,549 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package filter
+
+import (
+	"os"
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/seccomp"
+	"gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
+)
+
+// allowedSyscalls is the set of syscalls executed by the Sentry to the host OS.
+var allowedSyscalls = seccomp.SyscallRules{
+	syscall.SYS_CLOCK_GETTIME: {},
+	syscall.SYS_CLONE: []seccomp.Rule{
+		{
+			seccomp.AllowValue(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+		},
+	},
+	syscall.SYS_CLOSE: {},
+	syscall.SYS_DUP:   {},
+	syscall.SYS_DUP3: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.O_CLOEXEC),
+		},
+	},
+	syscall.SYS_EPOLL_CREATE1: {},
+	syscall.SYS_EPOLL_CTL:     {},
+	syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(0),
+		},
+	},
+	syscall.SYS_EVENTFD2: []seccomp.Rule{
+		{
+			seccomp.AllowValue(0),
+			seccomp.AllowValue(0),
+		},
+	},
+	syscall.SYS_EXIT:       {},
+	syscall.SYS_EXIT_GROUP: {},
+	syscall.SYS_FALLOCATE:  {},
+	syscall.SYS_FCHMOD:     {},
+	syscall.SYS_FCNTL: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.F_GETFL),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.F_SETFL),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.F_GETFD),
+		},
+	},
+	syscall.SYS_FSTAT:     {},
+	syscall.SYS_FSYNC:     {},
+	syscall.SYS_FTRUNCATE: {},
+	syscall.SYS_FUTEX: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
+			seccomp.AllowAny{},
+		},
+		// Non-private variants are included for flipcall support. They are otherwise
+		// unnecessary, as the sentry will use only private futexes internally.
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(linux.FUTEX_WAIT),
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(linux.FUTEX_WAKE),
+			seccomp.AllowAny{},
+		},
+	},
+	syscall.SYS_GETPID: {},
+	unix.SYS_GETRANDOM: {},
+	syscall.SYS_GETSOCKOPT: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.SOL_SOCKET),
+			seccomp.AllowValue(syscall.SO_DOMAIN),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.SOL_SOCKET),
+			seccomp.AllowValue(syscall.SO_TYPE),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.SOL_SOCKET),
+			seccomp.AllowValue(syscall.SO_ERROR),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.SOL_SOCKET),
+			seccomp.AllowValue(syscall.SO_SNDBUF),
+		},
+	},
+	syscall.SYS_GETTID:       {},
+	syscall.SYS_GETTIMEOFDAY: {},
+	// SYS_IOCTL is needed for terminal support, but we only allow
+	// setting/getting termios and winsize.
+	syscall.SYS_IOCTL: []seccomp.Rule{
+		{
+			seccomp.AllowAny{}, /* fd */
+			seccomp.AllowValue(linux.TCGETS),
+			seccomp.AllowAny{}, /* termios struct */
+		},
+		{
+			seccomp.AllowAny{}, /* fd */
+			seccomp.AllowValue(linux.TCSETS),
+			seccomp.AllowAny{}, /* termios struct */
+		},
+		{
+			seccomp.AllowAny{}, /* fd */
+			seccomp.AllowValue(linux.TCSETSF),
+			seccomp.AllowAny{}, /* termios struct */
+		},
+		{
+			seccomp.AllowAny{}, /* fd */
+			seccomp.AllowValue(linux.TCSETSW),
+			seccomp.AllowAny{}, /* termios struct */
+		},
+		{
+			seccomp.AllowAny{}, /* fd */
+			seccomp.AllowValue(linux.TIOCSWINSZ),
+			seccomp.AllowAny{}, /* winsize struct */
+		},
+		{
+			seccomp.AllowAny{}, /* fd */
+			seccomp.AllowValue(linux.TIOCGWINSZ),
+			seccomp.AllowAny{}, /* winsize struct */
+		},
+	},
+	syscall.SYS_LSEEK:   {},
+	syscall.SYS_MADVISE: {},
+	syscall.SYS_MINCORE: {},
+	// Used by the Go runtime as a temporary workaround for a Linux
+	// 5.2-5.4 bug.
+	//
+	// See src/runtime/os_linux_x86.go.
+	//
+	// TODO(b/148688965): Remove once this is gone from Go.
+	syscall.SYS_MLOCK: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(4096),
+		},
+	},
+	syscall.SYS_MMAP: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MAP_SHARED),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MAP_PRIVATE),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.PROT_WRITE | syscall.PROT_READ),
+			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
+		},
+	},
+	syscall.SYS_MPROTECT:  {},
+	syscall.SYS_MUNMAP:    {},
+	syscall.SYS_NANOSLEEP: {},
+	syscall.SYS_PPOLL:     {},
+	syscall.SYS_PREAD64:   {},
+	syscall.SYS_PREADV:    {},
+	syscall.SYS_PWRITE64:  {},
+	syscall.SYS_PWRITEV:   {},
+	syscall.SYS_READ:      {},
+	syscall.SYS_RECVMSG: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
+		},
+	},
+	syscall.SYS_RECVMMSG: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(fdbased.MaxMsgsPerRecv),
+			seccomp.AllowValue(syscall.MSG_DONTWAIT),
+			seccomp.AllowValue(0),
+		},
+	},
+	unix.SYS_SENDMMSG: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MSG_DONTWAIT),
+			seccomp.AllowValue(0),
+		},
+	},
+	syscall.SYS_RESTART_SYSCALL: {},
+	syscall.SYS_RT_SIGACTION:    {},
+	syscall.SYS_RT_SIGPROCMASK:  {},
+	syscall.SYS_RT_SIGRETURN:    {},
+	syscall.SYS_SCHED_YIELD:     {},
+	syscall.SYS_SENDMSG: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
+		},
+	},
+	syscall.SYS_SETITIMER: {},
+	syscall.SYS_SHUTDOWN: []seccomp.Rule{
+		// Used by fs/host to shutdown host sockets.
+		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)},
+		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)},
+		// Used by unet to shutdown connections.
+		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
+	},
+	syscall.SYS_SIGALTSTACK:     {},
+	unix.SYS_STATX:              {},
+	syscall.SYS_SYNC_FILE_RANGE: {},
+	syscall.SYS_TGKILL: []seccomp.Rule{
+		{
+			seccomp.AllowValue(uint64(os.Getpid())),
+		},
+	},
+	syscall.SYS_UTIMENSAT: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(0), /* null pathname */
+			seccomp.AllowAny{},
+			seccomp.AllowValue(0), /* flags */
+		},
+	},
+	syscall.SYS_WRITE: {},
+	// The only user in rawfile.NonBlockingWrite3 always passes iovcnt with
+	// values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR
+	// option is enabled for a packet socket.
+	syscall.SYS_WRITEV: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(2),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(3),
+		},
+	},
+}
+
+// hostInetFilters returns the syscall rules needed by sentry/socket/hostinet.
+func hostInetFilters() seccomp.SyscallRules {
+	return seccomp.SyscallRules{
+		syscall.SYS_ACCEPT4: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowAny{},
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+			},
+		},
+		syscall.SYS_BIND:        {},
+		syscall.SYS_CONNECT:     {},
+		syscall.SYS_GETPEERNAME: {},
+		syscall.SYS_GETSOCKNAME: {},
+		syscall.SYS_GETSOCKOPT: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IP),
+				seccomp.AllowValue(syscall.IP_TOS),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IP),
+				seccomp.AllowValue(syscall.IP_RECVTOS),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_TCLASS),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_V6ONLY),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_ERROR),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_KEEPALIVE),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_SNDBUF),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_RCVBUF),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_REUSEADDR),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_TYPE),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_LINGER),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_TCP),
+				seccomp.AllowValue(syscall.TCP_NODELAY),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_TCP),
+				seccomp.AllowValue(syscall.TCP_INFO),
+			},
+		},
+		syscall.SYS_IOCTL: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.TIOCOUTQ),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.TIOCINQ),
+			},
+		},
+		syscall.SYS_LISTEN:   {},
+		syscall.SYS_READV:    {},
+		syscall.SYS_RECVFROM: {},
+		syscall.SYS_RECVMSG:  {},
+		syscall.SYS_SENDMSG:  {},
+		syscall.SYS_SENDTO:   {},
+		syscall.SYS_SETSOCKOPT: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_V6ONLY),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_SNDBUF),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_RCVBUF),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_REUSEADDR),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_TCP),
+				seccomp.AllowValue(syscall.TCP_NODELAY),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IP),
+				seccomp.AllowValue(syscall.IP_TOS),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IP),
+				seccomp.AllowValue(syscall.IP_RECVTOS),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_TCLASS),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+		},
+		syscall.SYS_SHUTDOWN: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SHUT_RD),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SHUT_WR),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SHUT_RDWR),
+			},
+		},
+		syscall.SYS_SOCKET: []seccomp.Rule{
+			{
+				seccomp.AllowValue(syscall.AF_INET),
+				seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.AllowValue(0),
+			},
+			{
+				seccomp.AllowValue(syscall.AF_INET),
+				seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.AllowValue(0),
+			},
+			{
+				seccomp.AllowValue(syscall.AF_INET6),
+				seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.AllowValue(0),
+			},
+			{
+				seccomp.AllowValue(syscall.AF_INET6),
+				seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.AllowValue(0),
+			},
+		},
+		syscall.SYS_WRITEV: {},
+	}
+}
+
+// controlServerFilters returns syscall rules that restrict accept and listen
+// to fd and allow getsockopt only for SOL_SOCKET/SO_PEERCRED.
+func controlServerFilters(fd int) seccomp.SyscallRules {
+	return seccomp.SyscallRules{
+		syscall.SYS_ACCEPT: []seccomp.Rule{
+			{seccomp.AllowValue(fd)},
+		},
+		syscall.SYS_LISTEN: []seccomp.Rule{
+			{
+				seccomp.AllowValue(fd),
+				seccomp.AllowValue(16 /* unet.backlog */),
+			},
+		},
+		syscall.SYS_GETSOCKOPT: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_PEERCRED),
+			},
+		},
+	}
+}
diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go
new file mode 100644
index 000000000..5335ff82c
--- /dev/null
+++ b/runsc/boot/filter/config_amd64.go
@@ -0,0 +1,31 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// init extends the SYS_ARCH_PRCTL rules on amd64 with ARCH_GET_FS/ARCH_SET_FS.
+func init() {
+	getFS := seccomp.Rule{seccomp.AllowValue(linux.ARCH_GET_FS)}
+	setFS := seccomp.Rule{seccomp.AllowValue(linux.ARCH_SET_FS)}
+	allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL], getFS, setFS)
+}
diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go
new file mode 100644
index 000000000..7fa9bbda3
--- /dev/null
+++ b/runsc/boot/filter/config_arm64.go
@@ -0,0 +1,21 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package filter
+
+// init is intentionally empty on arm64; it is reserved for future customization.
+func init() {
+}
diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go
new file mode 100644
index 000000000..194952a7b
--- /dev/null
+++ b/runsc/boot/filter/config_profile.go
@@ -0,0 +1,34 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// profileFilters returns the extra rules for syscalls made by the runtime/pprof
+// package: openat(2) restricted to O_RDONLY|O_LARGEFILE|O_CLOEXEC flags.
+func profileFilters() seccomp.SyscallRules {
+	readOnlyOpen := seccomp.Rule{
+		seccomp.AllowAny{},
+		seccomp.AllowAny{},
+		seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
+	}
+	return seccomp.SyscallRules{
+		syscall.SYS_OPENAT: []seccomp.Rule{readOnlyOpen},
+	}
+}
diff --git a/runsc/boot/filter/extra_filters.go b/runsc/boot/filter/extra_filters.go
new file mode 100644
index 000000000..e28d4b8d6
--- /dev/null
+++ b/runsc/boot/filter/extra_filters.go
@@ -0,0 +1,28 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !msan,!race
+
+package filter
+
+import (
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// instrumentationFilters reports the extra syscall rules needed by Go
+// instrumentation (e.g. -race, -msan). This variant is built when neither
+// tool is enabled (see the build tag above), so it contributes no rules.
+func instrumentationFilters() seccomp.SyscallRules {
+	return seccomp.SyscallRules(nil)
+}
diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go
new file mode 100644
index 000000000..5e5a3c998
--- /dev/null
+++ b/runsc/boot/filter/extra_filters_msan.go
@@ -0,0 +1,32 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build msan
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// instrumentationFilters warns that MSAN is on and returns its extra syscalls.
+func instrumentationFilters() seccomp.SyscallRules {
+	Report("MSAN is enabled: syscall filters less restrictive!")
+	extra := seccomp.SyscallRules{}
+	extra[syscall.SYS_SCHED_GETAFFINITY] = []seccomp.Rule{}
+	extra[syscall.SYS_SET_ROBUST_LIST] = []seccomp.Rule{}
+	return extra
+}
diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go
new file mode 100644
index 000000000..9ff80276a
--- /dev/null
+++ b/runsc/boot/filter/extra_filters_race.go
@@ -0,0 +1,41 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build race
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// instrumentationFilters warns that TSAN is on and returns its extra syscalls.
+func instrumentationFilters() seccomp.SyscallRules {
+	Report("TSAN is enabled: syscall filters less restrictive!")
+	rules := seccomp.SyscallRules{}
+	rules[syscall.SYS_BRK] = []seccomp.Rule{}
+	rules[syscall.SYS_CLONE] = []seccomp.Rule{}
+	rules[syscall.SYS_FUTEX] = []seccomp.Rule{}
+	rules[syscall.SYS_MMAP] = []seccomp.Rule{}
+	rules[syscall.SYS_MUNLOCK] = []seccomp.Rule{}
+	rules[syscall.SYS_NANOSLEEP] = []seccomp.Rule{}
+	rules[syscall.SYS_OPEN] = []seccomp.Rule{}
+	rules[syscall.SYS_OPENAT] = []seccomp.Rule{}
+	rules[syscall.SYS_SET_ROBUST_LIST] = []seccomp.Rule{}
+	// Used within glibc's malloc.
+	rules[syscall.SYS_TIME] = []seccomp.Rule{}
+	return rules
+}
diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go
new file mode 100644
index 000000000..e80c171b3
--- /dev/null
+++ b/runsc/boot/filter/filter.go
@@ -0,0 +1,60 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package filter defines all syscalls the sandbox is allowed to make
+// to the host, and installs seccomp filters to prevent prohibited
+// syscalls in case it's compromised.
+package filter
+
+import (
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/seccomp"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+)
+
+// Options carries the settings that Install uses to assemble the seccomp filters.
+type Options struct {
+	Platform      platform.Platform // its SyscallFilters() are merged into the rule set
+	HostNetwork   bool              // when true, hostInetFilters() are merged in
+	ProfileEnable bool              // when true, profileFilters() are merged in
+	ControllerFD  int               // FD passed to controlServerFilters()
+}
+
+// Install assembles the syscall allow-list from the base rules plus the extras
+// selected by opt, and installs the result as the seccomp filter.
+func Install(opt Options) error {
+	// NOTE(review): if allowedSyscalls is a map, the merges below modify it too.
+	rules := allowedSyscalls
+	rules.Merge(controlServerFilters(opt.ControllerFD))
+	// Extra rules used by -race and -msan builds; empty when neither is enabled.
+	rules.Merge(instrumentationFilters())
+
+	if opt.HostNetwork {
+		Report("host networking enabled: syscall filters less restrictive!")
+		rules.Merge(hostInetFilters())
+	}
+	if opt.ProfileEnable {
+		Report("profile enabled: syscall filters less restrictive!")
+		rules.Merge(profileFilters())
+	}
+
+	rules.Merge(opt.Platform.SyscallFilters())
+
+	return seccomp.Install(rules)
+}
+
+// Report logs msg at warning level behind a "*** SECCOMP WARNING: " prefix.
+func Report(msg string) {
+	log.Warningf("*** SECCOMP WARNING: %s", msg)
+}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
new file mode 100644
index 000000000..4875452e2
--- /dev/null
+++ b/runsc/boot/fs.go
@@ -0,0 +1,1046 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"path/filepath"
+	"sort"
+	"strconv"
+	"strings"
+	"syscall"
+
+	// Include filesystem types that OCI spec might mount.
+	_ "gvisor.dev/gvisor/pkg/sentry/fs/dev"
+	_ "gvisor.dev/gvisor/pkg/sentry/fs/host"
+	_ "gvisor.dev/gvisor/pkg/sentry/fs/proc"
+	_ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
+	_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+	_ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
+	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+const (
+	// rootFsName is the filesystem name for 9p gofer mounts.
+	rootFsName = "9p"
+
+	// rootDevice is the device name for the root mount.
+	rootDevice = "9pfs-/"
+
+	// MountPrefix is the annotation prefix for mount hints.
+	MountPrefix = "dev.gvisor.spec.mount."
+
+	// Mount types that runsc supports.
+	bind     = "bind"
+	devpts   = "devpts"
+	devtmpfs = "devtmpfs"
+	proc     = "proc"
+	sysfs    = "sysfs"
+	tmpfs    = "tmpfs"
+	nonefs   = "none" // mounted as sysfs by getMountNameAndOptions
+)
+
+// tmpfsAllowedOptions lists the extra tmpfs option keys that are passed through.
+var tmpfsAllowedOptions = []string{"mode", "uid", "gid"}
+
+// addOverlay stacks a writable tmpfs-backed upper layer on top of lower and
+// returns the overlay root. Non-directories get a file overlay instead.
+func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
+	// The upper layer inherits lower's flags but must always be writable.
+	writableFlags := lowerFlags
+	writableFlags.ReadOnly = false
+	tmpfsImpl := mustFindFilesystem("tmpfs")
+
+	if isDir := fs.IsDir(lower.StableAttr); !isDir {
+		// Non-directory mount target (e.g. /etc/hostname): overlay the file.
+		src := fs.NewCachingMountSource(ctx, tmpfsImpl, writableFlags)
+		return fs.NewOverlayRootFile(ctx, src, lower, writableFlags)
+	}
+
+	// Directory mount target: back the overlay with a fresh tmpfs mount.
+	upperDir, err := tmpfsImpl.Mount(ctx, name+"-upper", writableFlags, "", nil)
+	if err != nil {
+		return nil, fmt.Errorf("creating tmpfs overlay: %v", err)
+	}
+	// Mirror lower's permissions and ownership onto the new upper root.
+	lowerAttr, err := lower.UnstableAttr(ctx)
+	if err != nil {
+		return nil, fmt.Errorf("reading attributes from lower mount point: %v", err)
+	}
+	if ok := upperDir.InodeOperations.SetPermissions(ctx, upperDir, lowerAttr.Perms); !ok {
+		return nil, fmt.Errorf("error setting permission to upper mount point")
+	}
+	if err := upperDir.InodeOperations.SetOwner(ctx, upperDir, lowerAttr.Owner); err != nil {
+		return nil, fmt.Errorf("setting owner to upper mount point: %v", err)
+	}
+	return fs.NewOverlayRoot(ctx, upperDir, lower, writableFlags)
+}
+
+// compileMounts returns the mounts to perform for spec: the always-present
+// /dev and /dev/pts, the supported spec mounts, and /proc and /sys if missing.
+func compileMounts(spec *specs.Spec) []specs.Mount {
+	// Keep track of whether proc and sys were mounted.
+	var procMounted, sysMounted bool
+	var mounts []specs.Mount
+
+	// Always mount /dev.
+	mounts = append(mounts, specs.Mount{
+		Type:        devtmpfs,
+		Destination: "/dev",
+	})
+
+	mounts = append(mounts, specs.Mount{
+		Type:        devpts,
+		Destination: "/dev/pts",
+	})
+
+	// Mount all submounts from the spec.
+	for _, m := range spec.Mounts {
+		if !specutils.IsSupportedDevMount(m) {
+			log.Warningf("ignoring dev mount at %q", m.Destination)
+			continue
+		}
+		mounts = append(mounts, m)
+		switch filepath.Clean(m.Destination) {
+		case "/proc":
+			procMounted = true
+		case "/sys":
+			sysMounted = true
+		}
+	}
+
+	// Mount proc and sys even if the user did not ask for it, as the spec
+	// says we SHOULD.
+	var mandatoryMounts []specs.Mount
+	if !procMounted {
+		mandatoryMounts = append(mandatoryMounts, specs.Mount{
+			Type:        proc,
+			Destination: "/proc",
+		})
+	}
+	if !sysMounted {
+		mandatoryMounts = append(mandatoryMounts, specs.Mount{
+			Type:        sysfs,
+			Destination: "/sys",
+		})
+	}
+
+	// Prepend the mandatory mounts so they are ordered right after the root,
+	// in case there are submounts of these mandatory mounts in the spec.
+	mounts = append(mandatoryMounts, mounts...)
+
+	return mounts
+}
+
+// p9MountOptions builds the option list for a 9P mount served over fd. Shared
+// file access additionally selects the remote_revalidating cache policy.
+func p9MountOptions(fd int, fa FileAccessType) []string {
+	fdStr := strconv.Itoa(fd)
+	// The same FD is used for both reading and writing.
+	options := []string{
+		"trans=fd", "rfdno=" + fdStr, "wfdno=" + fdStr, "privateunixsocket=true",
+	}
+	if fa != FileAccessShared {
+		return options
+	}
+	return append(options, "cache=remote_revalidating")
+}
+
+// parseAndFilterOptions keeps only the entries of opts whose key is listed in
+// allowedKeys. It fails on the first entry that parseMountOption rejects.
+func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) {
+	var kept []string
+	for i := range opts {
+		allowed, err := parseMountOption(opts[i], allowedKeys...)
+		switch {
+		case err != nil:
+			return nil, err
+		case allowed:
+			kept = append(kept, opts[i])
+		}
+	}
+	return kept, nil
+}
+
+// parseMountOption reports whether opt's key is allowed; more than one "=" is an error.
+func parseMountOption(opt string, allowedKeys ...string) (bool, error) {
+	if strings.Count(opt, "=") > 1 {
+		return false, fmt.Errorf("invalid option %q", opt)
+	}
+	return specutils.ContainsStr(allowedKeys, strings.SplitN(opt, "=", 2)[0]), nil
+}
+
+// mountDevice picks the device string for m: bind mounts get "9pfs-" plus the
+// destination, every other filesystem type gets "none".
+func mountDevice(m specs.Mount) string {
+	switch m.Type {
+	case bind:
+		// The target keeps the name consistent across S/R and unique per connection.
+		return "9pfs-" + m.Destination
+	default:
+		return "none"
+	}
+}
+
+// mountFlags translates mount option strings into fs.MountSourceFlags.
+// Unknown options are logged and otherwise ignored.
+func mountFlags(opts []string) fs.MountSourceFlags {
+	var flags fs.MountSourceFlags
+	// Note: changes to supported options must be reflected in
+	// isSupportedMountFlag() as well.
+	for _, opt := range opts {
+		switch opt {
+		case "rw", "ro":
+			flags.ReadOnly = opt == "ro"
+		case "noatime":
+			flags.NoAtime = true
+		case "noexec":
+			flags.NoExec = true
+		case "bind", "rbind":
+			// Nothing to record here: when options include either "bind" or
+			// "rbind", the mount is converted to a 9P mount.
+		default:
+			log.Warningf("ignoring unknown mount option %q", opt)
+		}
+	}
+	return flags
+}
+
+// isSupportedMountFlag reports whether opt is honored for a mount of fstype.
+func isSupportedMountFlag(fstype, opt string) bool {
+	if opt == "rw" || opt == "ro" || opt == "noatime" || opt == "noexec" {
+		return true
+	}
+	if fstype != tmpfs {
+		return false
+	}
+	allowed, err := parseMountOption(opt, tmpfsAllowedOptions...)
+	return err == nil && allowed
+}
+
+// mustFindFilesystem looks up a filesystem by name and panics if it is not found.
+func mustFindFilesystem(name string) fs.Filesystem {
+	if found, ok := fs.FindFilesystem(name); ok {
+		return found
+	}
+	panic(fmt.Sprintf("could not find filesystem %q", name))
+}
+
+// addSubmountOverlay returns an overlay of inode on top of a ramfs directory
+// tree that contains the given submount paths.
+func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
+	// The stub tree never changes, so its mount source can be fully caching.
+	// No real filesystem backs it, hence the nil filesystem argument.
+	stubSource := fs.NewCachingMountSource(ctx, nil, fs.MountSourceFlags{})
+	stubTree, err := ramfs.MakeDirectoryTree(ctx, stubSource, submounts)
+	if err != nil {
+		return nil, fmt.Errorf("creating mount tree: %v", err)
+	}
+
+	overlay, err := fs.NewOverlayRoot(ctx, inode, stubTree, fs.MountSourceFlags{})
+	if err != nil {
+		return nil, fmt.Errorf("adding mount overlay: %v", err)
+	}
+	// err is known to be nil at this point.
+	return overlay, nil
+}
+
+// subtargets returns, for every mount in mnts whose destination is a subpath
+// of root, that destination expressed relative to root.
+func subtargets(root string, mnts []specs.Mount) []string {
+	var found []string
+	for i := range mnts {
+		if rel, under := fs.IsSubpath(mnts[i].Destination, root); under {
+			found = append(found, rel)
+		}
+	}
+	return found
+}
+
+// setupContainerFS sets up the container's file system (delegating to
+// setupContainerVFS2 when conf.VFS2 is set) and resolves the executable path.
+func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+	if conf.VFS2 {
+		return setupContainerVFS2(ctx, conf, mntr, procArgs)
+	}
+	mountNS, err := mntr.setupFS(conf, procArgs)
+	if err != nil {
+		return err
+	}
+	// Set namespace here so that it can be found in ctx.
+	procArgs.MountNamespace = mountNS
+	return setExecutablePath(ctx, procArgs)
+}
+
+// setExecutablePath resolves procArgs.Argv[0] against the PATH taken from
+// procArgs.Envv and stores the result in procArgs.Filename.
+func setExecutablePath(ctx context.Context, procArgs *kernel.CreateProcessArgs) error {
+	searchPaths := fs.GetPath(procArgs.Envv)
+	binary := procArgs.Argv[0]
+	resolved, err := procArgs.MountNamespace.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, binary, searchPaths)
+	if err == nil {
+		procArgs.Filename = resolved
+		return nil
+	}
+	return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", binary, procArgs.WorkingDirectory, strings.Join(searchPaths, ":"), err)
+}
+
+// adjustDirentCache caps the gofer dirent cache at half of RLIMIT_NOFILE if smaller.
+func adjustDirentCache(k *kernel.Kernel) error {
+	var nofile syscall.Rlimit
+	if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &nofile); err != nil {
+		return fmt.Errorf("getting RLIMIT_NOFILE: %v", err)
+	}
+	half := nofile.Cur / 2
+	if int64(nofile.Cur) == syscall.RLIM_INFINITY || half >= gofer.DefaultDirentCacheSize {
+		return nil
+	}
+	log.Infof("Setting gofer dirent cache size to %d", half)
+	gofer.DefaultDirentCacheSize = half
+	k.DirentCacheLimiter = fs.NewDirentCacheLimiter(half)
+	return nil
+}
+
+// fdDispenser hands out a fixed list of FDs, first to last.
+type fdDispenser struct{ fds []int }
+
+// remove pops and returns the next FD; it panics once the dispenser is empty.
+func (f *fdDispenser) remove() int {
+	if f.empty() {
+		panic("fdDispenser out of fds")
+	}
+	next := f.fds[0]
+	f.fds = f.fds[1:]
+	return next
+}
+
+func (f *fdDispenser) empty() bool {
+	return len(f.fds) < 1 // true once every FD has been handed out
+}
+
+type shareType int // how widely a mount is shared; one of the constants below
+
+const (
+	invalid shareType = iota // zero value: no share has been set
+
+	// container shareType indicates that the mount is used by a single container.
+	container
+
+	// pod shareType indicates that the mount is used by more than one container
+	// inside the pod.
+	pod
+
+	// shared shareType indicates that the mount can also be shared with a process
+	// outside the pod, e.g. NFS.
+	shared
+)
+
+// parseShare maps "container", "pod" or "shared" to its shareType, else errors.
+func parseShare(val string) (shareType, error) {
+	known := map[string]shareType{
+		"container": container,
+		"pod":       pod,
+		"shared":    shared,
+	}
+	if share, ok := known[val]; ok {
+		return share, nil
+	}
+	return invalid, fmt.Errorf("invalid share value %q", val)
+}
+
+// String returns the name of the share type, or a message containing the raw
+// number for values outside the declared constants.
+func (s shareType) String() string {
+	names := map[shareType]string{
+		invalid:   "invalid",
+		container: "container",
+		pod:       "pod",
+		shared:    "shared",
+	}
+	if name, ok := names[s]; ok {
+		return name
+	}
+	return fmt.Sprintf("invalid share value %d", s)
+}
+
+// mountHint represents extra information about mounts that are provided via
+// annotations. They can override mount type, and provide sharing information
+// so that mounts can be correctly shared inside the pod.
+type mountHint struct {
+	name  string      // hint name, taken from the annotation key
+	share shareType   // set from the "share" annotation
+	mount specs.Mount // Source, Type and Options are set from annotations
+
+	// root is the inode where the volume is mounted. For mounts with 'pod' share
+	// the volume is mounted once and then bind mounted inside the containers.
+	root *fs.Inode
+}
+
+// setField applies one annotation field ("source", "type", "share", "options") to m.
+func (m *mountHint) setField(key, val string) error {
+	switch key {
+	case "type":
+		return m.setType(val)
+	case "options":
+		return m.setOptions(val)
+	case "share":
+		parsed, err := parseShare(val)
+		if err == nil {
+			m.share = parsed
+		}
+		return err
+	case "source":
+		if val == "" {
+			return fmt.Errorf("source cannot be empty")
+		}
+		m.mount.Source = val
+		return nil
+	}
+	return fmt.Errorf("invalid mount annotation: %s=%s", key, val)
+}
+
+// setType records the mount type; only "tmpfs" and "bind" are accepted.
+func (m *mountHint) setType(val string) error {
+	if val != "tmpfs" && val != "bind" {
+		return fmt.Errorf("invalid type %q", val)
+	}
+	// val is one of the accepted types.
+	m.mount.Type = val
+	return nil
+}
+
+// setOptions validates val's comma-separated options and stores them sorted.
+func (m *mountHint) setOptions(val string) error {
+	parsed := strings.Split(val, ",")
+	err := specutils.ValidateMountOptions(parsed)
+	if err == nil {
+		sort.Strings(parsed) // sorted so it can be compared with container mount options later on
+		m.mount.Options = parsed
+	}
+	return err
+}
+
+func (m *mountHint) isSupported() bool {
+	return m.mount.Type == tmpfs && m.share == pod // only pod-shared tmpfs hints qualify
+}
+
+// checkCompatible verifies that the container's mount agrees with the master
+// hint. For now all supported options must be identical. Once bind mounts are
+// properly supported, the master should merely be less restrictive than the
+// container, e.g. master can be 'rw' while container mounts as 'ro'.
+func (m *mountHint) checkCompatible(mount specs.Mount) error {
+	// Ignore options that do not affect the mount's behavior.
+	hintOpts, mountOpts := filterUnsupportedOptions(m.mount), filterUnsupportedOptions(mount)
+	differ := func() error {
+		return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", hintOpts, mountOpts)
+	}
+	if len(hintOpts) != len(mountOpts) {
+		return differ()
+	}
+	sort.Strings(hintOpts)
+	sort.Strings(mountOpts)
+	for i := range hintOpts {
+		if hintOpts[i] != mountOpts[i] {
+			return differ()
+		}
+	}
+	return nil
+}
+
+func (m *mountHint) fileAccessType() FileAccessType {
+	if m.share != container {
+		return FileAccessShared // any share type other than container
+	}
+	return FileAccessExclusive // the mount is used by a single container
+}
+
+func filterUnsupportedOptions(mount specs.Mount) []string {
+	supported := make([]string, 0, len(mount.Options)) // non-nil even when nothing is kept
+	for i := range mount.Options {
+		if opt := mount.Options[i]; isSupportedMountFlag(mount.Type, opt) {
+			supported = append(supported, opt)
+		}
+	}
+	return supported
+}
+
+// podMountHints contains a collection of mountHints for the pod.
+type podMountHints struct {
+	mounts map[string]*mountHint // keyed by hint name
+}
+
+// newPodMountHints builds the pod's mount hints from the annotations in spec
+// that start with MountPrefix ("<prefix><name>.<field>") and validates them.
+func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
+	mnts := make(map[string]*mountHint)
+	for k, v := range spec.Annotations {
+		if strings.HasPrefix(k, MountPrefix) {
+			// Remove the prefix and split the rest into name and field.
+			parts := strings.Split(k[len(MountPrefix):], ".")
+			if len(parts) != 2 {
+				return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v)
+			}
+			name := parts[0]
+			if len(name) == 0 {
+				return nil, fmt.Errorf("invalid mount name: %q in annotation %q", name, k)
+			}
+			mnt := mnts[name]
+			if mnt == nil {
+				mnt = &mountHint{name: name}
+				mnts[name] = mnt
+			}
+			if err := mnt.setField(parts[1], v); err != nil {
+				return nil, err
+			}
+		}
+	}
+
+	// Validate all hints after done parsing.
+	for name, m := range mnts {
+		log.Infof("Mount annotation found, name: %s, source: %q, type: %s, share: %v", name, m.mount.Source, m.mount.Type, m.share)
+		switch {
+		case m.share == invalid:
+			return nil, fmt.Errorf("share field for %q has not been set", m.name)
+		case len(m.mount.Source) == 0:
+			return nil, fmt.Errorf("source field for %q has not been set", m.name)
+		case len(m.mount.Type) == 0:
+			return nil, fmt.Errorf("type field for %q has not been set", m.name)
+		}
+
+		// Check for duplicate mount sources.
+		for name2, m2 := range mnts {
+			if name != name2 && m.mount.Source == m2.mount.Source {
+				return nil, fmt.Errorf("mounts %q and %q have the same mount source %q", m.name, m2.name, m.mount.Source)
+			}
+		}
+	}
+
+	return &podMountHints{mounts: mnts}, nil
+}
+
+func (p *podMountHints) findMount(mount specs.Mount) *mountHint {
+	for _, hint := range p.mounts { // hints are matched by mount source
+		if hint.mount.Source == mount.Source {
+			return hint
+		}
+	}
+	return nil // no hint was given for this source
+}
+
+type containerMounter struct {
+	root *specs.Root // taken from spec.Root by newContainerMounter
+
+	// mounts is the set of submounts for the container. It's a copy from the spec
+	// that may be freely modified without affecting the original spec.
+	mounts []specs.Mount
+
+	// fds is the list of FDs to be dispensed for mounts that require it.
+	fds fdDispenser
+
+	k *kernel.Kernel // supplies the supervisor context and the contexts used for mounting
+
+	hints *podMountHints // mount hints parsed from annotations
+}
+
+// newContainerMounter creates the mounter for spec. Mounts that need an FD
+// draw it from goferFDs, in order.
+func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
+	mounter := &containerMounter{root: spec.Root, k: k, hints: hints}
+	// compileMounts builds a fresh mount list from the spec.
+	mounter.mounts = compileMounts(spec)
+	mounter.fds = fdDispenser{fds: goferFDs}
+	return mounter
+}
+
+// processHints processes annotations that contain hints about how volumes
+// should be mounted (e.g. a volume shared between containers). It must be
+// called for the root container only.
+func (c *containerMounter) processHints(conf *Config) error {
+	if conf.VFS2 {
+		return nil
+	}
+	supervisorCtx := c.k.SupervisorContext()
+	for _, h := range c.hints.mounts {
+		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
+		// common gofer to mount all shared volumes.
+		if h.mount.Type != tmpfs {
+			continue
+		}
+		log.Infof("Mounting master of shared mount %q from %q type %q", h.name, h.mount.Source, h.mount.Type)
+		master, err := c.mountSharedMaster(supervisorCtx, conf, h)
+		if err != nil {
+			return fmt.Errorf("mounting shared master %q: %v", h.name, err)
+		}
+		h.root = master
+	}
+	return nil
+}
+
+// setupFS is used to set up the file system for all containers. This is the
+// main entry point method, with most of the others being internal only. It
+// returns the mount namespace that is created for the container.
+func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
+	log.Infof("Configuring container's file system")
+
+	// Mount with root credentials: the current user may not be privileged
+	// enough to mount the filesystem.
+	asRoot := *procArgs
+	asRoot.WorkingDirectory = "/"
+	asRoot.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
+	asRoot.Umask = 0022
+	asRoot.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
+	rootCtx := asRoot.NewContext(c.k)
+
+	mountNS, err := c.createMountNamespace(rootCtx, conf)
+	if err != nil {
+		return nil, err
+	}
+
+	// Set namespace here so that it can be found in rootCtx.
+	asRoot.MountNamespace = mountNS
+
+	if err := c.mountSubmounts(rootCtx, conf, mountNS); err != nil {
+		return nil, err
+	}
+	return mountNS, nil
+}
+
+func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Config) (*fs.MountNamespace, error) {
+	root, err := c.createRootMount(ctx, conf) // the root mount comes first
+	if err != nil {
+		return nil, fmt.Errorf("creating filesystem for container: %v", err)
+	}
+	ns, err := fs.NewMountNamespace(ctx, root) // the namespace is rooted at that mount
+	if err != nil {
+		return nil, fmt.Errorf("creating new mount namespace for container: %v", err)
+	}
+	return ns, nil
+}
+
+// mountSubmounts mounts every entry of c.mounts under the namespace root,
+// calls mountTmp, and finally checks that all gofer FDs were consumed.
+func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error {
+	root := mns.Root()
+	defer root.DecRef()
+
+	for _, m := range c.mounts {
+		log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options)
+		hint := c.hints.findMount(m)
+		if hint == nil || !hint.isSupported() {
+			// No usable hint: mount it as a regular submount.
+			if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
+				return fmt.Errorf("mount submount %q: %v", m.Destination, err)
+			}
+			continue
+		}
+		if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil {
+			return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err)
+		}
+	}
+
+	if err := c.mountTmp(ctx, conf, mns, root); err != nil {
+		return fmt.Errorf("mount submount %q: %v", "tmp", err)
+	}
+	return c.checkDispenser()
+}
+
+func (c *containerMounter) checkDispenser() error {
+	if c.fds.empty() {
+		return nil // every gofer FD was handed out
+	}
+	return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds)
+}
+
+// mountSharedMaster mounts the master copy of a volume that is shared among
+// containers in a pod and returns the inode at the root of that mount.
+func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) {
+	// Translate the mount type into a filesystem name and keep only the
+	// options that we are capable of dealing with.
+	fsName, mountOpts, overlay, err := c.getMountNameAndOptions(conf, hint.mount)
+	switch {
+	case err != nil:
+		return nil, err
+	case fsName == "":
+		return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
+	}
+
+	// Mount with revalidate because it's shared among containers.
+	mountOpts = append(mountOpts, "cache=revalidate")
+
+	// fsName should name a filesystem we know; mustFindFilesystem panics otherwise.
+	filesystem := mustFindFilesystem(fsName)
+
+	flags := mountFlags(hint.mount.Options)
+	if overlay {
+		// All writes go to upper, be paranoid and make lower readonly.
+		flags.ReadOnly = true
+	}
+
+	master, err := filesystem.Mount(ctx, mountDevice(hint.mount), flags, strings.Join(mountOpts, ","), nil)
+	if err != nil {
+		return nil, fmt.Errorf("creating mount %q: %v", hint.name, err)
+	}
+	if !overlay {
+		return master, nil
+	}
+
+	log.Debugf("Adding overlay on top of shared mount %q", hint.name)
+	master, err = addOverlay(ctx, conf, master, hint.mount.Type, flags)
+	if err != nil {
+		return nil, err
+	}
+	return master, nil
+}
+
+// createRootMount mounts the container's root over 9P, overlays it on stub
+// directories for the submounts, and optionally adds a tmpfs overlay on top.
+func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) {
+	// First construct the filesystem from the spec.Root.
+	rootFlags := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay}
+	ioFD := c.fds.remove()
+	log.Infof("Mounting root over 9P, ioFD: %d", ioFD)
+	p9 := mustFindFilesystem(rootFsName)
+	mountOpts := p9MountOptions(ioFD, conf.FileAccess)
+
+	if conf.OverlayfsStaleRead {
+		// We can't check for overlayfs here because sandbox is chroot'ed and gofer
+		// can only send mount options for specs.Mounts (specs.Root is missing
+		// Options field). So assume root is always on top of overlayfs.
+		mountOpts = append(mountOpts, "overlayfs_stale_read")
+	}
+
+	root, err := p9.Mount(ctx, rootDevice, rootFlags, strings.Join(mountOpts, ","), nil)
+	if err != nil {
+		return nil, fmt.Errorf("creating root mount point: %v", err)
+	}
+
+	// We need to overlay the root on top of a ramfs with stub directories
+	// for submount paths.  "/dev" "/sys" "/proc" and "/tmp" are always
+	// mounted even if they are not in the spec.
+	stubs := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp")
+	root, err = addSubmountOverlay(ctx, root, stubs)
+	if err != nil {
+		return nil, fmt.Errorf("adding submount overlay: %v", err)
+	}
+
+	if conf.Overlay && !c.root.Readonly {
+		log.Debugf("Adding overlay on top of root mount")
+		// Overlay a tmpfs filesystem on top of the root.
+		root, err = addOverlay(ctx, conf, root, "root-overlay-upper", rootFlags)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	log.Infof("Mounted %q to %q type root", c.root.Path, "/")
+	return root, nil
+}
+
+// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
+// used for mounts.
+func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) {
+	var (
+		fsName     string
+		opts       []string
+		useOverlay bool
+	)
+
+	for _, opt := range m.Options {
+		// When options include either "bind" or "rbind", this behaves as
+		// bind mount even if the mount type is equal to a filesystem supported
+		// on runsc.
+		if opt == "bind" || opt == "rbind" {
+			m.Type = bind
+			break
+		}
+	}
+
+	switch m.Type {
+	case devpts, devtmpfs, proc, sysfs:
+		fsName = m.Type
+	case nonefs:
+		fsName = sysfs
+	case tmpfs:
+		fsName = m.Type
+
+		var err error
+		opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...)
+		if err != nil {
+			return "", nil, false, err
+		}
+
+	case bind:
+		fd := c.fds.remove()
+		fsName = "9p"
+		opts = p9MountOptions(fd, c.getMountAccessType(m))
+		// If configured, add overlay to all writable mounts.
+		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
+
+	default:
+		log.Warningf("ignoring unknown filesystem type %q", m.Type)
+	}
+	return fsName, opts, useOverlay, nil
+}
+
+// getMountAccessType returns the file access type to use for mount. When a
+// pod mount hint matches the mount, the hint decides; otherwise the mount is
+// treated as shared.
+func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType {
+	hint := c.hints.findMount(mount)
+	if hint == nil {
+		// Non-root bind mounts are always shared if no hints were provided.
+		return FileAccessShared
+	}
+	return hint.fileAccessType()
+}
+
+// mountSubmount mounts volumes inside the container's root. Because mounts may
+// be readonly, a lower ramfs overlay is added to create the mount point dir.
+// Another overlay is added with tmpfs on top if Config.Overlay is true.
+// 'm.Destination' must be an absolute path with '..' and symlinks resolved.
+func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
+	// Map mount type to filesystem name, and parse out the options that we are
+	// capable of dealing with.
+	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
+	if err != nil {
+		return err
+	}
+	if fsName == "" {
+		// Filesystem is not supported (e.g. cgroup), just skip it.
+		return nil
+	}
+
+	// All filesystem names should have been mapped to something we know.
+	filesystem := mustFindFilesystem(fsName)
+
+	mf := mountFlags(m.Options)
+	if useOverlay {
+		// All writes go to upper, be paranoid and make lower readonly.
+		mf.ReadOnly = true
+	}
+
+	inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
+	if err != nil {
+		err := fmt.Errorf("creating mount with source %q: %v", m.Source, err)
+		// Check to see if this is a common error due to a Linux bug.
+		// This error is generated here in order to cause it to be
+		// printed to the user using Docker via 'runsc create' etc. rather
+		// than simply printed to the logs for the 'runsc boot' command.
+		//
+		// We check the error message string rather than type because the
+		// actual error types (syscall.EIO, syscall.EPIPE) are lost by file system
+		// implementation (e.g. p9).
+		// TODO(gvisor.dev/issue/1765): Remove message when bug is resolved.
+		if strings.Contains(err.Error(), syscall.EIO.Error()) || strings.Contains(err.Error(), syscall.EPIPE.Error()) {
+			return fmt.Errorf("%v: %s", err, specutils.FaqErrorMsg("memlock", "you may be encountering a Linux kernel bug"))
+		}
+		return err
+	}
+
+	// If there are submounts, we need to overlay the mount on top of a ramfs
+	// with stub directories for submount paths.
+	submounts := subtargets(m.Destination, c.mounts)
+	if len(submounts) > 0 {
+		log.Infof("Adding submount overlay over %q", m.Destination)
+		inode, err = addSubmountOverlay(ctx, inode, submounts)
+		if err != nil {
+			return fmt.Errorf("adding submount overlay: %v", err)
+		}
+	}
+
+	if useOverlay {
+		log.Debugf("Adding overlay on top of mount %q", m.Destination)
+		inode, err = addOverlay(ctx, conf, inode, m.Type, mf)
+		if err != nil {
+			return err
+		}
+	}
+
+	maxTraversals := uint(0)
+	dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals)
+	if err != nil {
+		return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
+	}
+	defer dirent.DecRef()
+	if err := mns.Mount(ctx, dirent, inode); err != nil {
+		return fmt.Errorf("mount %q error: %v", m.Destination, err)
+	}
+
+	log.Infof("Mounted %q to %q type: %s, internal-options: %q", m.Source, m.Destination, m.Type, opts)
+	return nil
+}
+
+// mountSharedSubmount binds mount to a previously mounted volume that is shared
+// among containers in the same pod.
+func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount specs.Mount, source *mountHint) error {
+	if err := source.checkCompatible(mount); err != nil {
+		return err
+	}
+
+	maxTraversals := uint(0)
+	target, err := mns.FindInode(ctx, root, root, mount.Destination, &maxTraversals)
+	if err != nil {
+		return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err)
+	}
+	defer target.DecRef()
+
+	// Take a ref on the inode that is about to be (re)-mounted.
+	source.root.IncRef()
+	if err := mns.Mount(ctx, target, source.root); err != nil {
+		source.root.DecRef()
+		return fmt.Errorf("bind mount %q error: %v", mount.Destination, err)
+	}
+
+	log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
+	return nil
+}
+
+// addRestoreMount records the mount arguments for m in renv.MountSources,
+// keyed by filesystem name, for use when restoring a checkpointed container.
+// Mounts with an unknown filesystem type are skipped without error.
+func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
+	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
+	switch {
+	case err != nil:
+		return err
+	case fsName == "":
+		// Filesystem is not supported (e.g. cgroup), just skip it.
+		return nil
+	}
+
+	dev := mountDevice(m)
+	flags := mountFlags(m.Options)
+	if useOverlay {
+		// The mount sits below an overlay, so it is recorded as read-only.
+		flags.ReadOnly = true
+	}
+	args := fs.MountArgs{
+		Dev:        dev,
+		Flags:      flags,
+		DataString: strings.Join(opts, ","),
+	}
+
+	renv.MountSources[fsName] = append(renv.MountSources[fsName], args)
+	log.Infof("Added mount at %q: %+v", fsName, args)
+	return nil
+}
+
+// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding
+// the mounts to the environment.
+func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) {
+	renv := &fs.RestoreEnvironment{
+		MountSources: make(map[string][]fs.MountArgs),
+	}
+
+	// Add root mount.
+	fd := c.fds.remove()
+	opts := p9MountOptions(fd, conf.FileAccess)
+
+	mf := fs.MountSourceFlags{}
+	if c.root.Readonly || conf.Overlay {
+		mf.ReadOnly = true
+	}
+
+	rootMount := fs.MountArgs{
+		Dev:        rootDevice,
+		Flags:      mf,
+		DataString: strings.Join(opts, ","),
+	}
+	renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount)
+
+	// Add submounts.
+	var tmpMounted bool
+	for _, m := range c.mounts {
+		if err := c.addRestoreMount(conf, renv, m); err != nil {
+			return nil, err
+		}
+		if filepath.Clean(m.Destination) == "/tmp" {
+			tmpMounted = true
+		}
+	}
+
+	// TODO(b/67958150): handle '/tmp' properly (see mountTmp()).
+	if !tmpMounted {
+		tmpMount := specs.Mount{
+			Type:        tmpfs,
+			Destination: "/tmp",
+		}
+		if err := c.addRestoreMount(conf, renv, tmpMount); err != nil {
+			return nil, err
+		}
+	}
+
+	return renv, nil
+}
+
+// mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so.
+// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
+// the host /tmp, but this is a nice optimization, and fixes some apps that call
+// mknod in /tmp. It's unsafe to mount tmpfs if:
+//   1. /tmp is mounted explicitly: we should not override user's wish
+//   2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
+//
+// Note that when there are submounts inside of '/tmp', directories for the
+// mount points must be present, making '/tmp' not empty anymore.
+func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
+	for _, m := range c.mounts {
+		if filepath.Clean(m.Destination) == "/tmp" {
+			log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m)
+			return nil
+		}
+	}
+
+	maxTraversals := uint(0)
+	tmp, err := mns.FindInode(ctx, root, root, "tmp", &maxTraversals)
+	switch err {
+	case nil:
+		// Found '/tmp' in filesystem, check if it's empty.
+		defer tmp.DecRef()
+		f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true})
+		if err != nil {
+			return err
+		}
+		defer f.DecRef()
+		serializer := &fs.CollectEntriesSerializer{}
+		if err := f.Readdir(ctx, serializer); err != nil {
+			return err
+		}
+		// If more than "." and ".." is found, skip internal tmpfs to prevent hiding
+		// existing files.
+		if len(serializer.Order) > 2 {
+			log.Infof("Skipping internal tmpfs on top %q, because it's not empty", "/tmp")
+			return nil
+		}
+		log.Infof("Mounting internal tmpfs on top of empty %q", "/tmp")
+		fallthrough
+
+	case syserror.ENOENT:
+		// No '/tmp' found (or fallthrough from above). Safe to mount internal
+		// tmpfs.
+		tmpMount := specs.Mount{
+			Type:        tmpfs,
+			Destination: "/tmp",
+			// Sticky bit is added to prevent accidental deletion of files from
+			// another user. This is normally done for /tmp.
+			Options: []string{"mode=1777"},
+		}
+		return c.mountSubmount(ctx, conf, mns, root, tmpMount)
+
+	default:
+		return err
+	}
+}
diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go
new file mode 100644
index 000000000..912037075
--- /dev/null
+++ b/runsc/boot/fs_test.go
@@ -0,0 +1,250 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"reflect"
+	"strings"
+	"testing"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+// TestPodMountHintsHappy checks that well-formed mount annotations are parsed
+// by newPodMountHints into hints carrying the expected name, source, type,
+// share and options.
+func TestPodMountHintsHappy(t *testing.T) {
+	spec := &specs.Spec{
+		Annotations: map[string]string{
+			MountPrefix + "mount1.source": "foo",
+			MountPrefix + "mount1.type":   "tmpfs",
+			MountPrefix + "mount1.share":  "pod",
+
+			MountPrefix + "mount2.source":  "bar",
+			MountPrefix + "mount2.type":    "bind",
+			MountPrefix + "mount2.share":   "container",
+			MountPrefix + "mount2.options": "rw,private",
+		},
+	}
+	podHints, err := newPodMountHints(spec)
+	if err != nil {
+		t.Fatalf("newPodMountHints failed: %v", err)
+	}
+
+	// Check that fields were set correctly. Each failure message names the
+	// field that was actually compared, so a failing run points at the right
+	// attribute.
+	mount1 := podHints.mounts["mount1"]
+	if want := "mount1"; want != mount1.name {
+		t.Errorf("mount1 name, want: %q, got: %q", want, mount1.name)
+	}
+	if want := "foo"; want != mount1.mount.Source {
+		t.Errorf("mount1 source, want: %q, got: %q", want, mount1.mount.Source)
+	}
+	if want := "tmpfs"; want != mount1.mount.Type {
+		t.Errorf("mount1 type, want: %q, got: %q", want, mount1.mount.Type)
+	}
+	if want := pod; want != mount1.share {
+		t.Errorf("mount1 share, want: %q, got: %q", want, mount1.share)
+	}
+	if want := []string(nil); !reflect.DeepEqual(want, mount1.mount.Options) {
+		t.Errorf("mount1 options, want: %q, got: %q", want, mount1.mount.Options)
+	}
+
+	mount2 := podHints.mounts["mount2"]
+	if want := "mount2"; want != mount2.name {
+		t.Errorf("mount2 name, want: %q, got: %q", want, mount2.name)
+	}
+	if want := "bar"; want != mount2.mount.Source {
+		t.Errorf("mount2 source, want: %q, got: %q", want, mount2.mount.Source)
+	}
+	if want := "bind"; want != mount2.mount.Type {
+		t.Errorf("mount2 type, want: %q, got: %q", want, mount2.mount.Type)
+	}
+	if want := container; want != mount2.share {
+		t.Errorf("mount2 share, want: %q, got: %q", want, mount2.share)
+	}
+	if want := []string{"private", "rw"}; !reflect.DeepEqual(want, mount2.mount.Options) {
+		t.Errorf("mount2 options, want: %q, got: %q", want, mount2.mount.Options)
+	}
+}
+
+func TestPodMountHintsErrors(t *testing.T) {
+	for _, tst := range []struct {
+		name        string
+		annotations map[string]string
+		error       string
+	}{
+		{
+			name: "too short",
+			annotations: map[string]string{
+				MountPrefix + "mount1": "foo",
+			},
+			error: "invalid mount annotation",
+		},
+		{
+			name: "no name",
+			annotations: map[string]string{
+				MountPrefix + ".source": "foo",
+			},
+			error: "invalid mount name",
+		},
+		{
+			name: "missing source",
+			annotations: map[string]string{
+				MountPrefix + "mount1.type":  "tmpfs",
+				MountPrefix + "mount1.share": "pod",
+			},
+			error: "source field",
+		},
+		{
+			name: "missing type",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.share":  "pod",
+			},
+			error: "type field",
+		},
+		{
+			name: "missing share",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "tmpfs",
+			},
+			error: "share field",
+		},
+		{
+			name: "invalid field name",
+			annotations: map[string]string{
+				MountPrefix + "mount1.invalid": "foo",
+			},
+			error: "invalid mount annotation",
+		},
+		{
+			name: "invalid source",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": "",
+				MountPrefix + "mount1.type":   "tmpfs",
+				MountPrefix + "mount1.share":  "pod",
+			},
+			error: "source cannot be empty",
+		},
+		{
+			name: "invalid type",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "invalid-type",
+				MountPrefix + "mount1.share":  "pod",
+			},
+			error: "invalid type",
+		},
+		{
+			name: "invalid share",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "tmpfs",
+				MountPrefix + "mount1.share":  "invalid-share",
+			},
+			error: "invalid share",
+		},
+		{
+			name: "invalid options",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source":  "foo",
+				MountPrefix + "mount1.type":    "tmpfs",
+				MountPrefix + "mount1.share":   "pod",
+				MountPrefix + "mount1.options": "invalid-option",
+			},
+			error: "unknown mount option",
+		},
+		{
+			name: "duplicate source",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "tmpfs",
+				MountPrefix + "mount1.share":  "pod",
+
+				MountPrefix + "mount2.source": "foo",
+				MountPrefix + "mount2.type":   "bind",
+				MountPrefix + "mount2.share":  "container",
+			},
+			error: "have the same mount source",
+		},
+	} {
+		t.Run(tst.name, func(t *testing.T) {
+			spec := &specs.Spec{Annotations: tst.annotations}
+			podHints, err := newPodMountHints(spec)
+			if err == nil || !strings.Contains(err.Error(), tst.error) {
+				t.Errorf("newPodMountHints invalid error, want: .*%s.*, got: %v", tst.error, err)
+			}
+			if podHints != nil {
+				t.Errorf("newPodMountHints must return nil on failure: %+v", podHints)
+			}
+		})
+	}
+}
+
+// TestGetMountAccessType checks which FileAccessType getMountAccessType
+// reports for a mount with source "foo" under different hint annotations,
+// including the case where no hint matches the mount's source.
+func TestGetMountAccessType(t *testing.T) {
+	const source = "foo"
+
+	// hintFor builds the annotations of a single bind-mount hint named mount1.
+	hintFor := func(src, share string) map[string]string {
+		return map[string]string{
+			MountPrefix + "mount1.source": src,
+			MountPrefix + "mount1.type":   "bind",
+			MountPrefix + "mount1.share":  share,
+		}
+	}
+
+	cases := []struct {
+		name        string
+		annotations map[string]string
+		want        FileAccessType
+	}{
+		{
+			name:        "container=exclusive",
+			annotations: hintFor(source, "container"),
+			want:        FileAccessExclusive,
+		},
+		{
+			name:        "pod=shared",
+			annotations: hintFor(source, "pod"),
+			want:        FileAccessShared,
+		},
+		{
+			name:        "shared=shared",
+			annotations: hintFor(source, "shared"),
+			want:        FileAccessShared,
+		},
+		{
+			// The hint's source does not match the mount, so the default
+			// applies.
+			name:        "default=shared",
+			annotations: hintFor(source+"mismatch", "container"),
+			want:        FileAccessShared,
+		},
+	}
+
+	for i := range cases {
+		tc := cases[i]
+		t.Run(tc.name, func(t *testing.T) {
+			hints, err := newPodMountHints(&specs.Spec{Annotations: tc.annotations})
+			if err != nil {
+				t.Fatalf("newPodMountHints failed: %v", err)
+			}
+			mounter := &containerMounter{hints: hints}
+			got := mounter.getMountAccessType(specs.Mount{Source: source})
+			if got != tc.want {
+				t.Errorf("getMountAccessType(), want: %v, got: %v", tc.want, got)
+			}
+		})
+	}
+}
diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go
new file mode 100644
index 000000000..ce62236e5
--- /dev/null
+++ b/runsc/boot/limits.go
@@ -0,0 +1,154 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"syscall"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// fromLinuxResource maps Linux resource names, as they appear in the Type
+// field of the spec's rlimits (e.g. "RLIMIT_NOFILE"), to the corresponding
+// limits.LimitType. It is used by createLimitSet to translate the spec and by
+// findName for the reverse lookup.
+var fromLinuxResource = map[string]limits.LimitType{
+	"RLIMIT_AS":         limits.AS,
+	"RLIMIT_CORE":       limits.Core,
+	"RLIMIT_CPU":        limits.CPU,
+	"RLIMIT_DATA":       limits.Data,
+	"RLIMIT_FSIZE":      limits.FileSize,
+	"RLIMIT_LOCKS":      limits.Locks,
+	"RLIMIT_MEMLOCK":    limits.MemoryLocked,
+	"RLIMIT_MSGQUEUE":   limits.MessageQueueBytes,
+	"RLIMIT_NICE":       limits.Nice,
+	"RLIMIT_NOFILE":     limits.NumberOfFiles,
+	"RLIMIT_NPROC":      limits.ProcessCount,
+	"RLIMIT_RSS":        limits.Rss,
+	"RLIMIT_RTPRIO":     limits.RealTimePriority,
+	"RLIMIT_RTTIME":     limits.Rttime,
+	"RLIMIT_SIGPENDING": limits.SignalsPending,
+	"RLIMIT_STACK":      limits.Stack,
+}
+
+// findName returns the Linux resource name (e.g. "RLIMIT_NOFILE") that
+// fromLinuxResource associates with lt, or "unknown" when there is none.
+func findName(lt limits.LimitType) string {
+	for name, candidate := range fromLinuxResource {
+		if candidate != lt {
+			continue
+		}
+		return name
+	}
+	return "unknown"
+}
+
+// defaults is the package-wide cache of default limits, read through
+// defaults.get().
+var defaults defs
+
+// defs lazily computes and caches the default limit set.
+type defs struct {
+	// mu is held by get() while set and err are read or written.
+	mu sync.Mutex
+	// set is the cached default limit set; nil until initDefaults succeeds.
+	set *limits.LimitSet
+	// err records a failed initialization; once set, get() keeps returning it
+	// without retrying.
+	err error
+}
+
+// get returns the cached default limit set, computing it on first use. An
+// initialization failure is remembered and returned on every later call.
+func (d *defs) get() (*limits.LimitSet, error) {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	switch {
+	case d.err != nil:
+		// A previous attempt failed; do not retry.
+		return nil, d.err
+	case d.set != nil:
+		// Already initialized.
+		return d.set, nil
+	}
+
+	if d.err = d.initDefaults(); d.err != nil {
+		return nil, d.err
+	}
+	return d.set, nil
+}
+
+// initDefaults builds the default limit set and stores it in d.set. It starts
+// from limits.NewLinuxLimitSet, applies a fixed baseline, and then replaces
+// the file size and open file limits with the host's values when they differ.
+// d.set is left untouched when an error is returned.
+func (d *defs) initDefaults() error {
+	set, err := limits.NewLinuxLimitSet()
+	if err != nil {
+		return err
+	}
+
+	// Baseline values follow what containers get by default, ex:
+	// $ docker run --rm debian prlimit
+	unlimited := limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}
+	none := limits.Limit{Cur: 0, Max: 0}
+	baseline := []struct {
+		lt    limits.LimitType
+		limit limits.Limit
+	}{
+		{limits.AS, unlimited},
+		{limits.Core, unlimited},
+		{limits.CPU, unlimited},
+		{limits.Data, unlimited},
+		{limits.FileSize, unlimited},
+		{limits.Locks, unlimited},
+		{limits.MemoryLocked, limits.Limit{Cur: 65536, Max: 65536}},
+		{limits.MessageQueueBytes, limits.Limit{Cur: 819200, Max: 819200}},
+		{limits.Nice, none},
+		{limits.NumberOfFiles, limits.Limit{Cur: 1048576, Max: 1048576}},
+		{limits.ProcessCount, unlimited},
+		{limits.Rss, unlimited},
+		{limits.RealTimePriority, none},
+		{limits.Rttime, unlimited},
+		{limits.SignalsPending, none},
+		{limits.Stack, limits.Limit{Cur: 8388608, Max: limits.Infinity}},
+	}
+	for _, b := range baseline {
+		set.SetUnchecked(b.lt, b.limit)
+	}
+
+	// Host limits that directly affect the sandbox take precedence over the
+	// baseline.
+	hostResources := []int{syscall.RLIMIT_FSIZE, syscall.RLIMIT_NOFILE}
+	for _, res := range hostResources {
+		var raw syscall.Rlimit
+		if err := syscall.Getrlimit(res, &raw); err != nil {
+			return err
+		}
+
+		lt, known := limits.FromLinuxResource[res]
+		if !known {
+			return fmt.Errorf("unknown rlimit type %v", res)
+		}
+		host := limits.Limit{
+			Cur: limits.FromLinux(raw.Cur),
+			Max: limits.FromLinux(raw.Max),
+		}
+
+		recommended := set.Get(lt)
+		if host.Cur != limits.Infinity && host.Cur < recommended.Cur {
+			log.Warningf("Host limit is lower than recommended, resource: %q, host: %d, recommended: %d", findName(lt), host.Cur, recommended.Cur)
+		}
+		if host.Cur == recommended.Cur && host.Max == recommended.Max {
+			// Host and baseline agree; nothing to change.
+			continue
+		}
+		log.Infof("Setting limit from host, resource: %q {soft: %d, hard: %d}", findName(lt), host.Cur, host.Max)
+		set.SetUnchecked(lt, host)
+	}
+
+	d.set = set
+	return nil
+}
+
+// createLimitSet returns the resource limits for the process described by
+// spec: the sandbox-wide defaults with spec.Process.Rlimits applied on top.
+//
+// The returned set is a private copy. defaults.get() hands out the single
+// cached *limits.LimitSet, so applying the spec's overrides to it directly
+// would change the defaults seen by every later caller (e.g. other
+// containers in the same sandbox).
+//
+// An error is returned when the defaults cannot be computed or when the spec
+// names a resource that is not in fromLinuxResource.
+func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) {
+	base, err := defaults.get()
+	if err != nil {
+		return nil, err
+	}
+
+	// Work on a copy so that the cached defaults are never mutated.
+	ls := base.GetCopy()
+
+	// Then apply overrides on top of defaults.
+	for _, rl := range spec.Process.Rlimits {
+		lt, ok := fromLinuxResource[rl.Type]
+		if !ok {
+			return nil, fmt.Errorf("unknown resource %q", rl.Type)
+		}
+		ls.SetUnchecked(lt, limits.Limit{
+			Cur: rl.Soft,
+			Max: rl.Hard,
+		})
+	}
+	return ls, nil
+}
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
new file mode 100644
index 000000000..8c8bad11c
--- /dev/null
+++ b/runsc/boot/loader.go
@@ -0,0 +1,1264 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package boot loads the kernel and runs a container.
+package boot
+
+import (
+	"fmt"
+	mrand "math/rand"
+	"os"
+	"runtime"
+	"sync/atomic"
+	"syscall"
+	gtime "time"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/cpuid"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/memutil"
+	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/fdimport"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/host"
+	"gvisor.dev/gvisor/pkg/sentry/fs/user"
+	hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/loader"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/sighandling"
+	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2"
+	"gvisor.dev/gvisor/pkg/sentry/time"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/watchdog"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
+	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/raw"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+	"gvisor.dev/gvisor/runsc/boot/filter"
+	_ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms.
+	"gvisor.dev/gvisor/runsc/boot/pprof"
+	"gvisor.dev/gvisor/runsc/specutils"
+
+	// Include supported socket providers.
+	"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
+	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink"
+	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route"
+	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
+	_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
+)
+
+// Loader keeps state needed to start the kernel and run the container.
+type Loader struct {
+	// k is the kernel.
+	k *kernel.Kernel
+
+	// ctrl is the control server.
+	ctrl *controller
+
+	// conf is the Config the loader operates with.
+	conf *Config
+
+	// console is set to true if terminal is enabled.
+	console bool
+
+	// watchdog is the loader's watchdog.
+	// NOTE(review): presumably the instance created in New() — confirm.
+	watchdog *watchdog.Watchdog
+
+	// stdioFDs contains stdin, stdout, and stderr.
+	stdioFDs []int
+
+	// goferFDs are the FDs that attach the sandbox to the gofers.
+	goferFDs []int
+
+	// spec is the base configuration for the root container.
+	spec *specs.Spec
+
+	// stopSignalForwarding disables forwarding of signals to the sandboxed
+	// container. It should be called when a sandbox is destroyed.
+	stopSignalForwarding func()
+
+	// restore is set to true if we are restoring a container.
+	restore bool
+
+	// rootProcArgs refers to the root sandbox init task.
+	rootProcArgs kernel.CreateProcessArgs
+
+	// sandboxID is the ID for the whole sandbox.
+	sandboxID string
+
+	// mu guards processes.
+	mu sync.Mutex
+
+	// processes maps containers init process and invocation of exec. Root
+	// processes are keyed with container ID and pid=0, while exec invocations
+	// have the corresponding pid set.
+	//
+	// processes is guarded by mu.
+	processes map[execID]*execProcess
+
+	// mountHints provides extra information about mounts for containers that
+	// apply to the entire pod.
+	mountHints *podMountHints
+}
+
+// execID uniquely identifies a sentry process that is executed in a container.
+// It is the key type of Loader.processes.
+type execID struct {
+	// cid is the container ID.
+	cid string
+	// pid is 0 for a container's root (init) process and the process's pid
+	// for exec invocations.
+	pid kernel.ThreadID
+}
+
+// execProcess contains the thread group and host TTY of a sentry process.
+type execProcess struct {
+	// tg will be nil for containers that haven't started yet.
+	tg *kernel.ThreadGroup
+
+	// tty will be nil if the process is not attached to a terminal.
+	tty *host.TTYFileOperations
+
+	// ttyVFS2 will be nil if the process is not attached to a terminal.
+	// NOTE(review): presumably the VFS2 counterpart of tty — confirm.
+	ttyVFS2 *hostvfs2.TTYFileDescription
+
+	// pidnsPath is the pid namespace path in spec.
+	pidnsPath string
+}
+
+// init seeds math/rand's global generator with the current time in
+// nanoseconds.
+func init() {
+	seed := gtime.Now().UnixNano()
+	mrand.Seed(seed)
+}
+
+// Args are the arguments for New().
+type Args struct {
+	// ID is the sandbox ID.
+	ID string
+	// Spec is the sandbox specification.
+	Spec *specs.Spec
+	// Conf is the system configuration.
+	Conf *Config
+	// ControllerFD is the FD to the URPC controller. The Loader takes ownership
+	// of this FD and may close it at any time.
+	ControllerFD int
+	// Device is an optional argument that is passed to the platform. The Loader
+	// takes ownership of this file and may close it at any time.
+	Device *os.File
+	// GoferFDs is an array of FDs used to connect with the Gofer. The Loader
+	// takes ownership of these FDs and may close them at any time.
+	GoferFDs []int
+	// StdioFDs is the stdio for the application. The Loader takes ownership of
+	// these FDs and may close them at any time.
+	StdioFDs []int
+	// Console is set to true if using TTY.
+	Console bool
+	// NumCPU is the number of CPUs to create inside the sandbox. When it is 0,
+	// New() uses runtime.NumCPU() instead.
+	NumCPU int
+	// TotalMem is the initial amount of total memory to report back to the
+	// container. It is only applied when greater than 0.
+	TotalMem uint64
+	// UserLogFD is the file descriptor to write user logs to.
+	UserLogFD int
+}
+
+// startingStdioFD is a fixed FD number used to make sure stdioFDs are always
+// the same on initial start and on restore.
+// NOTE(review): presumably the first FD number the stdio FDs are moved to —
+// confirm against the code that uses it.
+const startingStdioFD = 64
+
+// New initializes a new kernel loader configured by spec.
+// New also handles setting up a kernel for restoring a container.
+func New(args Args) (*Loader, error) {
+	// We initialize the rand package now to make sure /dev/urandom is pre-opened
+	// on kernels that do not support getrandom(2).
+	if err := rand.Init(); err != nil {
+		return nil, fmt.Errorf("setting up rand: %v", err)
+	}
+
+	if err := usage.Init(); err != nil {
+		return nil, fmt.Errorf("setting up memory usage: %v", err)
+	}
+
+	// Is this a VFSv2 kernel?
+	if args.Conf.VFS2 {
+		kernel.VFS2Enabled = true
+		vfs2.Override()
+	}
+
+	// Create kernel and platform.
+	p, err := createPlatform(args.Conf, args.Device)
+	if err != nil {
+		return nil, fmt.Errorf("creating platform: %v", err)
+	}
+	k := &kernel.Kernel{
+		Platform: p,
+	}
+
+	// Create memory file.
+	mf, err := createMemoryFile()
+	if err != nil {
+		return nil, fmt.Errorf("creating memory file: %v", err)
+	}
+	k.SetMemoryFile(mf)
+
+	// Create VDSO.
+	//
+	// Pass k as the platform since it is savable, unlike the actual platform.
+	//
+	// FIXME(b/109889800): Use non-nil context.
+	vdso, err := loader.PrepareVDSO(nil, k)
+	if err != nil {
+		return nil, fmt.Errorf("creating vdso: %v", err)
+	}
+
+	// Create timekeeper.
+	tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
+	if err != nil {
+		return nil, fmt.Errorf("creating timekeeper: %v", err)
+	}
+	tk.SetClocks(time.NewCalibratedClocks())
+
+	if err := enableStrace(args.Conf); err != nil {
+		return nil, fmt.Errorf("enabling strace: %v", err)
+	}
+
+	// Create root network namespace/stack.
+	netns, err := newRootNetworkNamespace(args.Conf, k, k)
+	if err != nil {
+		return nil, fmt.Errorf("creating network: %v", err)
+	}
+
+	// Create capabilities.
+	caps, err := specutils.Capabilities(args.Conf.EnableRaw, args.Spec.Process.Capabilities)
+	if err != nil {
+		return nil, fmt.Errorf("converting capabilities: %v", err)
+	}
+
+	// Convert the spec's additional GIDs to KGIDs.
+	extraKGIDs := make([]auth.KGID, 0, len(args.Spec.Process.User.AdditionalGids))
+	for _, GID := range args.Spec.Process.User.AdditionalGids {
+		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
+	}
+
+	// Create credentials.
+	creds := auth.NewUserCredentials(
+		auth.KUID(args.Spec.Process.User.UID),
+		auth.KGID(args.Spec.Process.User.GID),
+		extraKGIDs,
+		caps,
+		auth.NewRootUserNamespace())
+
+	if args.NumCPU == 0 {
+		args.NumCPU = runtime.NumCPU()
+	}
+	log.Infof("CPUs: %d", args.NumCPU)
+
+	if args.TotalMem > 0 {
+		// Adjust the total memory returned by the Sentry so that applications that
+		// use /proc/meminfo can make allocations based on this limit.
+		usage.MinimumTotalMemoryBytes = args.TotalMem
+		log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30))
+	}
+
+	// Initiate the Kernel object, which is required by the Context passed
+	// to createVFS in order to mount (among other things) procfs.
+	if err = k.Init(kernel.InitKernelArgs{
+		FeatureSet:                  cpuid.HostFeatureSet(),
+		Timekeeper:                  tk,
+		RootUserNamespace:           creds.UserNamespace,
+		RootNetworkNamespace:        netns,
+		ApplicationCores:            uint(args.NumCPU),
+		Vdso:                        vdso,
+		RootUTSNamespace:            kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
+		RootIPCNamespace:            kernel.NewIPCNamespace(creds.UserNamespace),
+		RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
+		PIDNamespace:                kernel.NewRootPIDNamespace(creds.UserNamespace),
+	}); err != nil {
+		return nil, fmt.Errorf("initializing kernel: %v", err)
+	}
+
+	if err := adjustDirentCache(k); err != nil {
+		return nil, err
+	}
+
+	// Turn on packet logging if enabled.
+	if args.Conf.LogPackets {
+		log.Infof("Packet logging enabled")
+		atomic.StoreUint32(&sniffer.LogPackets, 1)
+	} else {
+		log.Infof("Packet logging disabled")
+		atomic.StoreUint32(&sniffer.LogPackets, 0)
+	}
+
+	// Create a watchdog.
+	dogOpts := watchdog.DefaultOpts
+	dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
+	dog := watchdog.New(k, dogOpts)
+
+	procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
+	if err != nil {
+		return nil, fmt.Errorf("creating init process for root container: %v", err)
+	}
+
+	if err := initCompatLogs(args.UserLogFD); err != nil {
+		return nil, fmt.Errorf("initializing compat logs: %v", err)
+	}
+
+	mountHints, err := newPodMountHints(args.Spec)
+	if err != nil {
+		return nil, fmt.Errorf("creating pod mount hints: %v", err)
+	}
+
+	if kernel.VFS2Enabled {
+		// Set up host mount that will be used for imported fds.
+		hostFilesystem := hostvfs2.NewFilesystem(k.VFS())
+		defer hostFilesystem.DecRef()
+		hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{})
+		if err != nil {
+			return nil, fmt.Errorf("failed to create hostfs mount: %v", err)
+		}
+		k.SetHostMount(hostMount)
+	}
+
+	// Make host FDs stable between invocations. Host FDs must map to the exact
+	// same number when the sandbox is restored. Otherwise the wrong FD will be
+	// used.
+	var stdioFDs []int
+	newfd := startingStdioFD
+	for _, fd := range args.StdioFDs {
+		err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC)
+		if err != nil {
+			return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err)
+		}
+		stdioFDs = append(stdioFDs, newfd)
+		err = syscall.Close(fd)
+		if err != nil {
+			return nil, fmt.Errorf("close original stdioFDs failed: %v", err)
+		}
+		newfd++
+	}
+
+	eid := execID{cid: args.ID}
+	l := &Loader{
+		k:            k,
+		conf:         args.Conf,
+		console:      args.Console,
+		watchdog:     dog,
+		spec:         args.Spec,
+		goferFDs:     args.GoferFDs,
+		stdioFDs:     stdioFDs,
+		rootProcArgs: procArgs,
+		sandboxID:    args.ID,
+		processes:    map[execID]*execProcess{eid: {}},
+		mountHints:   mountHints,
+	}
+
+	// We don't care about child signals; some platforms can generate a
+	// tremendous number of useless ones (I'm looking at you, ptrace).
+	if err := sighandling.IgnoreChildStop(); err != nil {
+		return nil, fmt.Errorf("ignore child stop signals failed: %v", err)
+	}
+
+	// Create the control server using the provided FD.
+	//
+	// This must be done *after* we have initialized the kernel since the
+	// controller is used to configure the kernel's network stack.
+	ctrl, err := newController(args.ControllerFD, l)
+	if err != nil {
+		return nil, fmt.Errorf("creating control server: %v", err)
+	}
+	l.ctrl = ctrl
+
+	// Only start serving after Loader is set to controller and controller is set
+	// to Loader, because they are both used in the urpc methods.
+	if err := ctrl.srv.StartServing(); err != nil {
+		return nil, fmt.Errorf("starting control server: %v", err)
+	}
+
+	return l, nil
+}
+
+// newProcess builds the kernel.CreateProcessArgs for a container's init
+// process. The result is meant to be handed to kernel.CreateProcess; nothing
+// is started here.
+//
+// Resource limits come from createLimitSet(spec), whose error is returned
+// wrapped. The UTS, IPC and abstract socket namespaces are always the kernel's
+// root ones, while the PID namespace is the one supplied by the caller.
+func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
+	limits, err := createLimitSet(spec)
+	if err != nil {
+		return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %v", err)
+	}
+
+	// Fall back to the root directory when the spec leaves Cwd unset.
+	cwd := "/"
+	if spec.Process.Cwd != "" {
+		cwd = spec.Process.Cwd
+	}
+
+	return kernel.CreateProcessArgs{
+		Argv:                    spec.Process.Args,
+		Envv:                    spec.Process.Env,
+		WorkingDirectory:        cwd,
+		Credentials:             creds,
+		Umask:                   0022,
+		Limits:                  limits,
+		MaxSymlinkTraversals:    linux.MaxSymlinkTraversals,
+		UTSNamespace:            k.RootUTSNamespace(),
+		IPCNamespace:            k.RootIPCNamespace(),
+		AbstractSocketNamespace: k.RootAbstractSocketNamespace(),
+		ContainerID:             id,
+		PIDNamespace:            pidns,
+	}, nil
+}
+
+// Destroy releases the resources held by the loader: it stops the control
+// server (if one was created), stops signal forwarding (if it was started) and
+// stops the watchdog.
+//
+// Stopping the control server blocks until every open control connection has
+// been closed. Do NOT call Destroy from a defer: a panic inside a control
+// server rpc would then hang forever.
+func (l *Loader) Destroy() {
+	if ctrl := l.ctrl; ctrl != nil {
+		ctrl.srv.Stop()
+	}
+	if stopForwarding := l.stopSignalForwarding; stopForwarding != nil {
+		stopForwarding()
+	}
+	l.watchdog.Stop()
+}
+
+// createPlatform instantiates the platform named by conf.Platform, passing it
+// deviceFile. An unknown platform name is treated as a programming error and
+// panics; errors from the platform constructor itself are returned.
+func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) {
+	constructor, lookupErr := platform.Lookup(conf.Platform)
+	if lookupErr != nil {
+		panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, lookupErr))
+	}
+
+	log.Infof("Platform: %s", conf.Platform)
+	return constructor.New(deviceFile)
+}
+
+// createMemoryFile creates a memfd named "runsc-memory" and wraps it in a
+// pgalloc.MemoryFile. If the MemoryFile cannot be created the memfd is closed
+// before the error is returned.
+func createMemoryFile() (*pgalloc.MemoryFile, error) {
+	const memfileName = "runsc-memory"
+
+	fd, err := memutil.CreateMemFD(memfileName, 0)
+	if err != nil {
+		return nil, fmt.Errorf("error creating memfd: %v", err)
+	}
+	backing := os.NewFile(uintptr(fd), memfileName)
+
+	// pgalloc.MemoryFileOpts.UseHostMemcgPressure cannot be enabled even when
+	// memory cgroups are specified: by now we are already inside a mount
+	// namespace in which the relevant cgroupfs is not visible.
+	mf, err := pgalloc.NewMemoryFile(backing, pgalloc.MemoryFileOpts{})
+	if err == nil {
+		return mf, nil
+	}
+	backing.Close()
+	return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err)
+}
+
+// installSeccompFilters installs the syscall filters unless
+// conf.DisableSeccomp is set, in which case it only reports that filtering is
+// disabled. The filter options are derived from the kernel's platform, the
+// network mode, the profiling setting and the control server's FD.
+func (l *Loader) installSeccompFilters() error {
+	if l.conf.DisableSeccomp {
+		filter.Report("syscall filter is DISABLED. Running in less secure mode.")
+		return nil
+	}
+
+	err := filter.Install(filter.Options{
+		Platform:      l.k.Platform,
+		HostNetwork:   l.conf.Network == NetworkHost,
+		ProfileEnable: l.conf.ProfileEnable,
+		ControllerFD:  l.ctrl.srv.FD(),
+	})
+	if err != nil {
+		return fmt.Errorf("installing seccomp filters: %v", err)
+	}
+	return nil
+}
+
+// Run starts the root container and publishes the outcome (nil or the error)
+// on the controller manager's startResultChan before returning it.
+func (l *Loader) Run() error {
+	runErr := l.run()
+	l.ctrl.manager.startResultChan <- runErr
+	if runErr == nil {
+		return nil
+	}
+
+	// Leave the controller a moment to relay the failure to the runtime.
+	// Returning right away would let the process exit and the control
+	// connection close before the error has been delivered.
+	gtime.Sleep(2 * gtime.Second)
+	return runErr
+}
+
+// run performs the actual start of the root container. In order, it:
+//
+//   - configures the host network stack when conf.Network is NetworkHost;
+//   - unless restoring, installs seccomp filters, imports the stdio FDs,
+//     mounts the container file system and creates the init process;
+//   - records the init thread group (and TTY, when running with a console) in
+//     the root container's execProcess entry;
+//   - starts forwarding external signals to the root container;
+//   - closes the loader's stdio FDs (VFS1 only);
+//   - starts the watchdog and the kernel.
+//
+// l.mu is held from after host network configuration until the function
+// returns.
+func (l *Loader) run() error {
+	if l.conf.Network == NetworkHost {
+		// Delay host network configuration to this point because network namespace
+		// is configured after the loader is created and before Run() is called.
+		log.Debugf("Configuring host network")
+		stack := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack)
+		if err := stack.Configure(); err != nil {
+			return err
+		}
+	}
+
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	// The root container is registered under the sandbox ID with no PID.
+	eid := execID{cid: l.sandboxID}
+	ep, ok := l.processes[eid]
+	if !ok {
+		return fmt.Errorf("trying to start deleted container %q", l.sandboxID)
+	}
+
+	// If we are restoring, we do not want to create a process.
+	// l.restore is set by the container manager when a restore call is made.
+	var ttyFile *host.TTYFileOperations
+	var ttyFileVFS2 *hostvfs2.TTYFileDescription
+	if !l.restore {
+		if l.conf.ProfileEnable {
+			pprof.Initialize()
+		}
+
+		// Finally done with all configuration. Setup filters before user code
+		// is loaded.
+		if err := l.installSeccompFilters(); err != nil {
+			return err
+		}
+
+		// Create the FD map, which will set stdin, stdout, and stderr.  If console
+		// is true, then ioctl calls will be passed through to the host fd.
+		ctx := l.rootProcArgs.NewContext(l.k)
+		var err error
+
+		// CreateProcess takes a reference on FDMap if successful. We won't need
+		// ours either way.
+		l.rootProcArgs.FDTable, ttyFile, ttyFileVFS2, err = createFDTable(ctx, l.console, l.stdioFDs)
+		if err != nil {
+			return fmt.Errorf("importing fds: %v", err)
+		}
+
+		// Setup the root container file system.
+		l.startGoferMonitor(l.sandboxID, l.goferFDs)
+
+		mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
+		if err := mntr.processHints(l.conf); err != nil {
+			return err
+		}
+		if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil {
+			return err
+		}
+
+		// Add the HOME environment variable if it is not already set.
+		var envv []string
+		if kernel.VFS2Enabled {
+			envv, err = user.MaybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2,
+				l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
+
+		} else {
+			envv, err = user.MaybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace,
+				l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
+		}
+		if err != nil {
+			return err
+		}
+		l.rootProcArgs.Envv = envv
+
+		// Create the root container init task. It will begin running
+		// when the kernel is started.
+		if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil {
+			return fmt.Errorf("creating init process: %v", err)
+		}
+
+		// CreateProcess takes a reference on FDTable if successful.
+		l.rootProcArgs.FDTable.DecRef()
+	}
+
+	// Record the init thread group and, if the spec names a PID namespace, its
+	// path so that startContainer can match containers sharing it.
+	ep.tg = l.k.GlobalInit()
+	if ns, ok := specutils.GetNS(specs.PIDNamespace, l.spec); ok {
+		ep.pidnsPath = ns.Path
+	}
+	if l.console {
+		// Set the foreground process group on the TTY to the global init process
+		// group, since that is what we are about to start running.
+		switch {
+		case ttyFileVFS2 != nil:
+			ep.ttyVFS2 = ttyFileVFS2
+			ttyFileVFS2.InitForegroundProcessGroup(ep.tg.ProcessGroup())
+		case ttyFile != nil:
+			ep.tty = ttyFile
+			ttyFile.InitForegroundProcessGroup(ep.tg.ProcessGroup())
+		}
+	}
+
+	// Handle signals by forwarding them to the root container process
+	// (except for panic signal, which should cause a panic).
+	l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
+		// Panic signal should cause a panic.
+		if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) {
+			panic("Signal-induced panic")
+		}
+
+		// Otherwise forward to root container.
+		deliveryMode := DeliverToProcess
+		if l.console {
+			// Since we are running with a console, we should forward the signal to
+			// the foreground process group so that job control signals like ^C can
+			// be handled properly.
+			deliveryMode = DeliverToForegroundProcessGroup
+		}
+		log.Infof("Received external signal %d, mode: %v", sig, deliveryMode)
+		if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil {
+			log.Warningf("error sending signal %v to container %q: %v", sig, l.sandboxID, err)
+		}
+	})
+
+	// l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again
+	// either in createFDTable() during initial start or in descriptor.initAfterLoad()
+	// during restore, we can release l.stdioFDs now. VFS2 takes ownership of the
+	// passed FDs, so only close for VFS1.
+	if !kernel.VFS2Enabled {
+		for _, fd := range l.stdioFDs {
+			err := syscall.Close(fd)
+			if err != nil {
+				return fmt.Errorf("close dup()ed stdioFDs: %v", err)
+			}
+		}
+	}
+
+	log.Infof("Process should have started...")
+	l.watchdog.Start()
+	return l.k.Start()
+}
+
+// createContainer registers a new, not yet started, container with ID 'cid'
+// inside the sandbox. It fails if a container with that ID is already
+// registered.
+func (l *Loader) createContainer(cid string) error {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	key := execID{cid: cid}
+	_, exists := l.processes[key]
+	if exists {
+		return fmt.Errorf("container %q already exists", cid)
+	}
+
+	// An empty entry (nil thread group) marks "created but not started".
+	l.processes[key] = &execProcess{}
+	return nil
+}
+
+// startContainer starts a child container: it creates the init process for
+// container 'cid' from 'spec', starts it, and records its thread group in
+// l.processes.
+//
+// 'files' must hold stdin, stdout and stderr in its first three positions; any
+// remaining entries are the gofer files for the container. An error is
+// returned if fewer than three files are supplied. Caller owns 'files' and may
+// close them after this method returns.
+//
+// The container must have been registered beforehand (see createContainer);
+// otherwise an error is returned.
+func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
+	// Validate up front: slicing 'files' below would otherwise panic on a
+	// short list instead of reporting an error to the caller.
+	if len(files) < 3 {
+		return fmt.Errorf("starting container %q requires at least 3 files (stdin, stdout, and stderr), but %d files received", cid, len(files))
+	}
+
+	// Create capabilities.
+	caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
+	if err != nil {
+		return fmt.Errorf("creating capabilities: %v", err)
+	}
+
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	eid := execID{cid: cid}
+	if _, ok := l.processes[eid]; !ok {
+		return fmt.Errorf("trying to start a deleted container %q", cid)
+	}
+
+	// Convert the spec's additional GIDs to KGIDs.
+	extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids))
+	for _, GID := range spec.Process.User.AdditionalGids {
+		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
+	}
+
+	// Create credentials. We reuse the root user namespace because the
+	// sentry currently supports only 1 mount namespace, which is tied to a
+	// single user namespace. Thus we must run in the same user namespace
+	// to access mounts.
+	creds := auth.NewUserCredentials(
+		auth.KUID(spec.Process.User.UID),
+		auth.KGID(spec.Process.User.GID),
+		extraKGIDs,
+		caps,
+		l.k.RootUserNamespace())
+
+	// Pick the PID namespace: share the one of an existing container whose
+	// namespace path matches, create a new child namespace when the spec asks
+	// for a PID namespace but none matches, or use the root one otherwise.
+	var pidns *kernel.PIDNamespace
+	if ns, ok := specutils.GetNS(specs.PIDNamespace, spec); ok {
+		if ns.Path != "" {
+			for _, p := range l.processes {
+				if ns.Path == p.pidnsPath {
+					pidns = p.tg.PIDNamespace()
+					break
+				}
+			}
+		}
+		if pidns == nil {
+			pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace())
+		}
+		l.processes[eid].pidnsPath = ns.Path
+	} else {
+		pidns = l.k.RootPIDNamespace()
+	}
+	procArgs, err := newProcess(cid, spec, creds, l.k, pidns)
+	if err != nil {
+		return fmt.Errorf("creating new process: %v", err)
+	}
+
+	// setupContainerFS() dups stdioFDs, so we don't need to dup them here.
+	var stdioFDs []int
+	for _, f := range files[:3] {
+		stdioFDs = append(stdioFDs, int(f.Fd()))
+	}
+
+	// Create the FD map, which will set stdin, stdout, and stderr.
+	ctx := procArgs.NewContext(l.k)
+	fdTable, _, _, err := createFDTable(ctx, false, stdioFDs)
+	if err != nil {
+		return fmt.Errorf("importing fds: %v", err)
+	}
+	// CreateProcess takes a reference on fdTable if successful. We won't
+	// need ours either way.
+	procArgs.FDTable = fdTable
+
+	// Can't take ownership away from os.File. dup them to get a new FDs.
+	var goferFDs []int
+	for _, f := range files[3:] {
+		fd, err := syscall.Dup(int(f.Fd()))
+		if err != nil {
+			// Nothing else knows about the FDs duplicated so far; close them
+			// (best effort) so they are not leaked.
+			for _, dupFD := range goferFDs {
+				_ = syscall.Close(dupFD)
+			}
+			return fmt.Errorf("failed to dup file: %v", err)
+		}
+		goferFDs = append(goferFDs, fd)
+	}
+
+	// Setup the child container file system.
+	l.startGoferMonitor(cid, goferFDs)
+
+	mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints)
+	if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil {
+		return err
+	}
+
+	// Create and start the new process.
+	tg, _, err := l.k.CreateProcess(procArgs)
+	if err != nil {
+		return fmt.Errorf("creating process: %v", err)
+	}
+	l.k.StartProcess(tg)
+
+	// CreateProcess takes a reference on FDTable if successful.
+	procArgs.FDTable.DecRef()
+
+	l.processes[eid].tg = tg
+	return nil
+}
+
+// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
+// the gofer FDs looking for disconnects, and destroys the container if a
+// disconnect occurs in any of the gofer FDs.
+func (l *Loader) startGoferMonitor(cid string, goferFDs []int) {
+	go func() {
+		log.Debugf("Monitoring gofer health for container %q", cid)
+		var events []unix.PollFd
+		for _, fd := range goferFDs {
+			events = append(events, unix.PollFd{
+				Fd:     int32(fd),
+				Events: unix.POLLHUP | unix.POLLRDHUP,
+			})
+		}
+		_, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) {
+			// Use ppoll instead of poll because it's already whilelisted in seccomp.
+			n, err := unix.Ppoll(events, nil, nil)
+			return uintptr(n), 0, err
+		})
+		if err != nil {
+			panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err))
+		}
+
+		// Check if the gofer has stopped as part of normal container destruction.
+		// This is done just to avoid sending an annoying error message to the log.
+		// Note that there is a small race window in between mu.Unlock() and the
+		// lock being reacquired in destroyContainer(), but it's harmless to call
+		// destroyContainer() multiple times.
+		l.mu.Lock()
+		_, ok := l.processes[execID{cid: cid}]
+		l.mu.Unlock()
+		if ok {
+			log.Infof("Gofer socket disconnected, destroying container %q", cid)
+			if err := l.destroyContainer(cid); err != nil {
+				log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err)
+			}
+		}
+	}()
+}
+
+// destroyContainer stops a container if it is still running and cleans up its
+// filesystem.
+//
+// It returns an error if the container is not registered or if SIGKILL could
+// not be sent. On success every l.processes entry belonging to 'cid' (the init
+// process and any exec'd processes) has been removed. l.mu is held for the
+// whole call, including while waiting for the container's processes to exit.
+func (l *Loader) destroyContainer(cid string) error {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid})
+	if err != nil {
+		// Container doesn't exist.
+		return err
+	}
+
+	// The container exists, but has it been started?
+	if tg != nil {
+		if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
+			return fmt.Errorf("sending SIGKILL to all container processes: %v", err)
+		}
+		// Wait for all processes that belong to the container to exit (including
+		// exec'd processes).
+		for _, t := range l.k.TaskSet().Root.Tasks() {
+			if t.ContainerID() == cid {
+				t.ThreadGroup().WaitExited()
+			}
+		}
+
+		// At this point, all processes inside of the container have exited,
+		// releasing all references to the container's MountNamespace and
+		// causing all submounts and overlays to be unmounted.
+		//
+		// Since the container's MountNamespace has been released,
+		// MountNamespace.destroy() will have executed, but that function may
+		// trigger async close operations. We must wait for those to complete
+		// before returning, otherwise the caller may kill the gofer before
+		// they complete, causing a cascade of failing RPCs.
+		fs.AsyncBarrier()
+	}
+
+	// No more failure from this point on. Remove all container thread groups
+	// from the map.
+	for key := range l.processes {
+		if key.cid == cid {
+			delete(l.processes, key)
+		}
+	}
+
+	log.Debugf("Container destroyed %q", cid)
+	return nil
+}
+
+// executeAsync starts a new process inside the already started container
+// args.ContainerID and returns the thread ID reported by control.ExecAsync.
+//
+// args is modified in place: its mount namespace, environment (HOME may be
+// added) and PID namespace are filled in from the container's init thread
+// group. The new process is recorded in l.processes under
+// (args.ContainerID, tgid).
+//
+// An error is returned if the container is unknown or has not been started.
+func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
+	// Hold the lock for the entire operation to ensure that exec'd process is
+	// added to 'processes' in case it races with destroyContainer().
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID})
+	if err != nil {
+		return 0, err
+	}
+	if tg == nil {
+		return 0, fmt.Errorf("container %q not started", args.ContainerID)
+	}
+
+	// Get the container MountNamespace from the Task.
+	if kernel.VFS2Enabled {
+		// task.MountNamespace() does not take a ref, so we must do so ourselves.
+		args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2()
+		args.MountNamespaceVFS2.IncRef()
+	} else {
+		tg.Leader().WithMuLocked(func(t *kernel.Task) {
+			// task.MountNamespace() does not take a ref, so we must do so ourselves.
+			args.MountNamespace = t.MountNamespace()
+			args.MountNamespace.IncRef()
+		})
+	}
+
+	// Add the HOME environment variable if it is not already set.
+	if kernel.VFS2Enabled {
+		// Drop the reference taken above once this function returns.
+		defer args.MountNamespaceVFS2.DecRef()
+
+		root := args.MountNamespaceVFS2.Root()
+		defer root.DecRef()
+		ctx := vfs.WithRoot(l.k.SupervisorContext(), root)
+		envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv)
+		if err != nil {
+			return 0, err
+		}
+		args.Envv = envv
+	} else {
+		// Drop the reference taken above once this function returns.
+		defer args.MountNamespace.DecRef()
+
+		root := args.MountNamespace.Root()
+		defer root.DecRef()
+		ctx := fs.WithRoot(l.k.SupervisorContext(), root)
+		envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
+		if err != nil {
+			return 0, err
+		}
+		args.Envv = envv
+	}
+
+	// Start the process.
+	proc := control.Proc{Kernel: l.k}
+	args.PIDNamespace = tg.PIDNamespace()
+	newTG, tgid, ttyFile, ttyFileVFS2, err := control.ExecAsync(&proc, args)
+	if err != nil {
+		return 0, err
+	}
+
+	// Track the exec'd process so it can later be waited on and signaled.
+	eid := execID{cid: args.ContainerID, pid: tgid}
+	l.processes[eid] = &execProcess{
+		tg:      newTG,
+		tty:     ttyFile,
+		ttyVFS2: ttyFileVFS2,
+	}
+	log.Debugf("updated processes: %v", l.processes)
+
+	return tgid, nil
+}
+
+// waitContainer blocks until the init process of container 'cid' has exited
+// and stores its exit status in *waitStatus. It fails if the container is
+// unknown or has not been started.
+//
+// l.mu is only held while the thread group is looked up, not while waiting,
+// so several clients can wait on the same container at once.
+func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
+	initTG, err := l.threadGroupFromID(execID{cid: cid})
+	if err != nil {
+		return fmt.Errorf("can't wait for container %q: %v", cid, err)
+	}
+
+	// Whether the thread group exited already or exits while we wait, the
+	// container is considered exited.
+	*waitStatus = l.wait(initTG)
+	return nil
+}
+
+// waitPID blocks until the process 'tgid' of container 'cid' has exited and
+// stores its exit status in *waitStatus.
+//
+// Processes started through exec are looked up in l.processes and removed
+// from it once they have exited. Any other PID is resolved in the PID
+// namespace of the container's init process and must belong to 'cid'.
+func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error {
+	if tgid <= 0 {
+		return fmt.Errorf("PID (%d) must be positive", tgid)
+	}
+
+	// First try a process that was exec'd.
+	eid := execID{cid: cid, pid: tgid}
+	if execTG, err := l.threadGroupFromID(eid); err == nil {
+		*waitStatus = l.wait(execTG)
+
+		l.mu.Lock()
+		delete(l.processes, eid)
+		log.Debugf("updated processes (removal): %v", l.processes)
+		l.mu.Unlock()
+		return nil
+	}
+
+	// Otherwise the caller may be waiting on a process that was not started
+	// directly via exec; look it up in the container's PID namespace.
+	initTG, err := l.threadGroupFromID(execID{cid: cid})
+	if err != nil {
+		return fmt.Errorf("waiting for PID %d: %v", tgid, err)
+	}
+	target := initTG.PIDNamespace().ThreadGroupWithID(tgid)
+	switch {
+	case target == nil:
+		return fmt.Errorf("waiting for PID %d: no such process", tgid)
+	case target.Leader().ContainerID() != cid:
+		return fmt.Errorf("process %d is part of a different container: %q", tgid, target.Leader().ContainerID())
+	}
+
+	*waitStatus = l.wait(target)
+	return nil
+}
+
+// wait blocks until the thread group 'tg' has exited and returns its exit
+// status as a uint32.
+func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 {
+	tg.WaitExited()
+	exitStatus := tg.ExitStatus()
+	return exitStatus.Status()
+}
+
+// WaitForStartSignal blocks until the control server signals, through the
+// controller manager's startChan, that the sandbox should start.
+func (l *Loader) WaitForStartSignal() {
+	startCh := l.ctrl.manager.startChan
+	<-startCh
+}
+
+// WaitExit blocks until the kernel has exited and then returns the exit
+// status of the root container's init process.
+func (l *Loader) WaitExit() kernel.ExitStatus {
+	l.k.WaitExited()
+
+	rootTG := l.k.GlobalInit()
+	return rootTG.ExitStatus()
+}
+
+// newRootNetworkNamespace creates the root network namespace for the network
+// mode selected in conf. It panics on an unknown network mode.
+//
+// The stack created here is empty because the network namespace may still be
+// empty at this point: netns is configured before Run() is called, netstack
+// is configured using a control uRPC message, and host network is configured
+// inside Run().
+func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
+	if conf.Network == NetworkHost {
+		// hostinet has no network namespacing support yet, hence the nil
+		// creator.
+		return inet.NewRootNamespace(hostinet.NewStack(), nil), nil
+	}
+	if conf.Network != NetworkNone && conf.Network != NetworkSandbox {
+		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
+	}
+
+	// NetworkNone and NetworkSandbox both use a netstack-backed stack.
+	emptyStack, err := newEmptySandboxNetworkStack(clock, uniqueID)
+	if err != nil {
+		return nil, err
+	}
+	return inet.NewRootNamespace(emptyStack, &sandboxNetstackCreator{
+		clock:    clock,
+		uniqueID: uniqueID,
+	}), nil
+}
+
+// newEmptySandboxNetworkStack creates a netstack-backed inet.Stack that has
+// no NICs configured yet.
+//
+// The stack speaks IPv4, IPv6 and ARP at the network layer and TCP, UDP and
+// ICMPv4 at the transport layer. SACK, the default TTLs, receive buffer
+// auto-tuning and the default iptables are set up before it is returned. Any
+// failure to apply one of these options is returned as an error.
+func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
+	netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
+	transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
+	s := netstack.Stack{stack.New(stack.Options{
+		NetworkProtocols:   netProtos,
+		TransportProtocols: transProtos,
+		Clock:              clock,
+		Stats:              netstack.Metrics,
+		HandleLocal:        true,
+		// Enable raw sockets for users with sufficient
+		// privileges.
+		RawFactory: raw.EndpointFactory{},
+		UniqueID:   uniqueID,
+	})}
+
+	// Enable SACK Recovery.
+	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
+		return nil, fmt.Errorf("failed to enable SACK: %v", err)
+	}
+
+	// Set default TTLs as required by socket/netstack. Failures are reported
+	// like for every other option instead of being silently dropped.
+	if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)); err != nil {
+		return nil, fmt.Errorf("failed to set default IPv4 TTL: %v", err)
+	}
+	if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)); err != nil {
+		return nil, fmt.Errorf("failed to set default IPv6 TTL: %v", err)
+	}
+
+	// Enable Receive Buffer Auto-Tuning.
+	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
+		return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
+	}
+
+	s.FillDefaultIPTables()
+
+	return &s, nil
+}
+
+// sandboxNetstackCreator implements kernel.NetworkStackCreator.
+//
+// It keeps the parameters needed to build additional sandbox network stacks
+// with newEmptySandboxNetworkStack (see CreateStack).
+//
+// +stateify savable
+type sandboxNetstackCreator struct {
+	// clock is passed to every stack this creator builds.
+	clock tcpip.Clock
+	// uniqueID is passed to every stack this creator builds; CreateStack also
+	// draws the loopback NIC ID from it.
+	uniqueID stack.UniqueID
+}
+
+// CreateStack implements kernel.NetworkStackCreator.CreateStack.
+//
+// It builds a fresh empty sandbox network stack and adds a loopback NIC to
+// it, configured from DefaultLoopbackLink.
+func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
+	newStack, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID)
+	if err != nil {
+		return nil, err
+	}
+
+	// Attach the loopback interface.
+	network := &Network{Stack: newStack.(*netstack.Stack).Stack}
+	loopbackID := tcpip.NICID(f.uniqueID.UniqueID())
+	loopbackLink := DefaultLoopbackLink
+	loopbackEP := loopback.New()
+	err = network.createNICWithAddrs(loopbackID, loopbackLink.Name, loopbackEP, loopbackLink.Addresses)
+	if err != nil {
+		return nil, err
+	}
+
+	return newStack, nil
+}
+
+// signal delivers 'signo' to one or more processes of container 'cid'. A PID
+// of 0 selects the container's init process. The delivery mode decides
+// whether the signal goes to the indicated process only, to every process in
+// the container, or to the foreground process group.
+//
+// Negative PIDs are rejected, and so is a non-zero PID combined with
+// DeliverToAllProcesses. An unknown delivery mode panics.
+func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error {
+	if pid < 0 {
+		return fmt.Errorf("PID (%d) must be positive", pid)
+	}
+
+	switch mode {
+	case DeliverToProcess:
+		err := l.signalProcess(cid, kernel.ThreadID(pid), signo)
+		if err == nil {
+			return nil
+		}
+		return fmt.Errorf("signaling process in container %q PID %d: %v", cid, pid, err)
+
+	case DeliverToForegroundProcessGroup:
+		err := l.signalForegrondProcessGroup(cid, kernel.ThreadID(pid), signo)
+		if err == nil {
+			return nil
+		}
+		return fmt.Errorf("signaling foreground process group in container %q PID %d: %v", cid, pid, err)
+
+	case DeliverToAllProcesses:
+		if pid != 0 {
+			return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid)
+		}
+		// Make sure the container has actually started before signaling it.
+		if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil {
+			return err
+		}
+		err := l.signalAllProcesses(cid, signo)
+		if err == nil {
+			return nil
+		}
+		return fmt.Errorf("signaling all processes in container %q: %v", cid, err)
+
+	default:
+		panic(fmt.Sprintf("unknown signal delivery mode %v", mode))
+	}
+}
+
+// signalProcess sends 'signo' to the thread group 'tgid' of container 'cid'.
+//
+// The process is first looked up among those tracked in l.processes. If it is
+// not found there, it is resolved in the PID namespace of the container's
+// init process and must belong to 'cid'.
+func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error {
+	if execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid}); err == nil {
+		// Tracked process: signal it directly.
+		return l.k.SendExternalSignalThreadGroup(execTG, &arch.SignalInfo{Signo: signo})
+	}
+
+	// The target may not have been started directly via exec. Find it in the
+	// container's PID namespace instead.
+	initTG, err := l.threadGroupFromID(execID{cid: cid})
+	if err != nil {
+		return fmt.Errorf("no thread group found: %v", err)
+	}
+	target := initTG.PIDNamespace().ThreadGroupWithID(tgid)
+	switch {
+	case target == nil:
+		return fmt.Errorf("no such process with PID %d", tgid)
+	case target.Leader().ContainerID() != cid:
+		return fmt.Errorf("process %d is part of a different container: %q", tgid, target.Leader().ContainerID())
+	}
+	return l.k.SendExternalSignalThreadGroup(target, &arch.SignalInfo{Signo: signo})
+}
+
+// signalForegrondProcessGroup looks up foreground process group from the TTY
+// for the given "tgid" inside container "cid", and send the signal to it.
+//
+// If the TTY has no foreground process group set, the signal is sent to the
+// thread group identified by (cid, tgid) instead. An error is returned when
+// that process is unknown, not started, or has no TTY attached. When
+// signaling the group, delivery is attempted for every member and the last
+// error encountered (if any) is returned.
+func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error {
+	// l.mu is only held for the two lookups below; it is released explicitly
+	// on every path before any signal is sent.
+	l.mu.Lock()
+	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid})
+	if err != nil {
+		l.mu.Unlock()
+		return fmt.Errorf("no thread group found: %v", err)
+	}
+	if tg == nil {
+		l.mu.Unlock()
+		return fmt.Errorf("container %q not started", cid)
+	}
+
+	tty, ttyVFS2, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid})
+	l.mu.Unlock()
+	if err != nil {
+		return fmt.Errorf("no thread group found: %v", err)
+	}
+
+	// Use whichever TTY implementation is attached; the VFS2 one is checked
+	// first.
+	var pg *kernel.ProcessGroup
+	switch {
+	case ttyVFS2 != nil:
+		pg = ttyVFS2.ForegroundProcessGroup()
+	case tty != nil:
+		pg = tty.ForegroundProcessGroup()
+	default:
+		return fmt.Errorf("no TTY attached")
+	}
+	if pg == nil {
+		// No foreground process group has been set. Signal the
+		// original thread group.
+		log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid)
+		return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo})
+	}
+	// Send the signal to all processes in the process group.
+	var lastErr error
+	for _, tg := range l.k.TaskSet().Root.ThreadGroups() {
+		if tg.ProcessGroup() != pg {
+			continue
+		}
+		if err := l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}); err != nil {
+			lastErr = err
+		}
+	}
+	return lastErr
+}
+
+// signalAllProcesses sends 'signo' to every process that belongs to container
+// 'cid'. It's a noop if the container hasn't started or has exited.
+func (l *Loader) signalAllProcesses(cid string, signo int32) error {
+	// The kernel stays paused while the signal is delivered so that no new
+	// process can be created in the meantime. This prevents process leaks
+	// when SIGKILL is sent to the entire container.
+	l.k.Pause()
+	defer l.k.Unpause()
+
+	info := &arch.SignalInfo{Signo: signo}
+	return l.k.SendContainerSignal(cid, info)
+}
+
+// threadGroupFromID wraps tryThreadGroupFromIDLocked: it takes l.mu for the
+// duration of the lookup and, unlike the wrapped function, treats a container
+// that has not started yet (nil thread group) as an error.
+func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	switch tg, err := l.tryThreadGroupFromIDLocked(key); {
+	case err != nil:
+		return nil, err
+	case tg == nil:
+		return nil, fmt.Errorf("container %q not started", key.cid)
+	default:
+		return tg, nil
+	}
+}
+
+// tryThreadGroupFromIDLocked looks up the thread group registered under the
+// given execution ID. The returned thread group may be nil when the container
+// has not been started yet. An error is returned when there is no entry for
+// the ID, e.g. because it is invalid or the container has been deleted.
+// Caller must hold 'mu'.
+func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) {
+	if entry := l.processes[key]; entry != nil {
+		return entry.tg, nil
+	}
+	return nil, fmt.Errorf("container %q not found", key.cid)
+}
+
+// ttyFromIDLocked looks up the TTY files (VFS1 and VFS2 flavors) registered
+// under the given execution ID. Both may be nil, e.g. when the container has
+// not been started yet. An error is returned when there is no entry for the
+// ID, e.g. because it is invalid or the container has been deleted. Caller
+// must hold 'mu'.
+func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+	if entry := l.processes[key]; entry != nil {
+		return entry.tty, entry.ttyVFS2, nil
+	}
+	return nil, nil, fmt.Errorf("container %q not found", key.cid)
+}
+
+// createFDTable creates a new FD table on the kernel found in ctx and imports
+// the three host stdio FDs (stdin, stdout and stderr, in that order) into it.
+// 'console' is forwarded to fdimport.Import, whose TTY file results are
+// returned alongside the table.
+//
+// On import failure the reference on the new table is dropped before the
+// error is returned.
+func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+	if count := len(stdioFDs); count != 3 {
+		return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", count)
+	}
+
+	table := kernel.KernelFromContext(ctx).NewFDTable()
+	tty, ttyVFS2, err := fdimport.Import(ctx, table, console, stdioFDs)
+	if err == nil {
+		return table, tty, ttyVFS2, nil
+	}
+	table.DecRef()
+	return nil, nil, nil, err
+}
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
new file mode 100644
index 000000000..e448fd773
--- /dev/null
+++ b/runsc/boot/loader_test.go
@@ -0,0 +1,715 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"math/rand"
+	"os"
+	"reflect"
+	"syscall"
+	"testing"
+	"time"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/control/server"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/unet"
+	"gvisor.dev/gvisor/runsc/fsgofer"
+)
+
+func init() {
+	log.SetLevel(log.Debug)
+	rand.Seed(time.Now().UnixNano()) // createLoader draws its control socket address from rand.
+	if err := fsgofer.OpenProcSelfFD(); err != nil {
+		panic(err) // NOTE(review): presumably required by the gofers started in these tests - confirm.
+	}
+}
+
+// testConfig returns the Config shared by the tests in this file: no network,
+// seccomp disabled, ptrace platform.
+func testConfig() *Config {
+	conf := Config{RootDir: "unused_root_dir", Network: NetworkNone}
+	conf.DisableSeccomp = true
+	conf.Platform = "ptrace"
+	return &conf
+}
+
+// testSpec returns a simple spec that can be used in tests.
+func testSpec() *specs.Spec {
+	// The host filesystem root is the sandbox root.
+	root := &specs.Root{
+		Path:     "/",
+		Readonly: true,
+	}
+	// The only process in the spec is /bin/true.
+	proc := &specs.Process{
+		Args: []string{"/bin/true"},
+	}
+	return &specs.Spec{Root: root, Process: proc}
+}
+
+// startGofer starts a new gofer routine serving 'root' path. It returns the
+// sandbox side of the connection, and a function that when called will stop the
+// gofer. On error both ends of the socket pair are closed before returning.
+func startGofer(root string) (int, func(), error) {
+	fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
+	if err != nil {
+		return 0, nil, err
+	}
+	sandboxEnd, goferEnd := fds[0], fds[1]
+	socket, err := unet.NewSocket(goferEnd)
+	if err != nil {
+		syscall.Close(sandboxEnd)
+		syscall.Close(goferEnd)
+		return 0, nil, fmt.Errorf("error creating server on FD %d: %v", goferEnd, err)
+	}
+	at, err := fsgofer.NewAttachPoint(root, fsgofer.Config{ROMount: true})
+	if err != nil {
+		syscall.Close(sandboxEnd)
+		socket.Close() // No gofer will run; the socket now owns goferEnd.
+		return 0, nil, err
+	}
+	go func() {
+		if err := p9.NewServer(at).Handle(socket); err != nil {
+			log.Infof("Gofer is stopping. FD: %d, err: %v\n", goferEnd, err)
+		}
+	}()
+	// Closing the gofer socket will stop the gofer and exit goroutine above.
+	cleanup := func() {
+		if err := socket.Close(); err != nil {
+			log.Warningf("Error closing gofer socket: %v", err)
+		}
+	}
+	return sandboxEnd, cleanup, nil
+}
+
+// createLoader builds a Loader with ID "foo" for 'spec', backed by a gofer
+// serving spec.Root.Path. The returned function stops that gofer.
+func createLoader(vfsEnabled bool, spec *specs.Spec) (*Loader, func(), error) {
+	fd, err := server.CreateSocket(ControlSocketAddr(fmt.Sprintf("%010d", rand.Int())[:10]))
+	if err != nil {
+		return nil, nil, err
+	}
+	conf := testConfig()
+	conf.VFS2 = vfsEnabled
+	sandEnd, cleanup, err := startGofer(spec.Root.Path)
+	if err != nil {
+		syscall.Close(fd)
+		return nil, nil, err
+	}
+	// Loader takes ownership of stdio.
+	var stdio []int
+	for _, f := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
+		newFd, err := unix.Dup(int(f.Fd()))
+		if err != nil {
+			// No Loader owns these yet: stop the gofer and close every FD.
+			cleanup()
+			for _, c := range append(stdio, fd, sandEnd) {
+				syscall.Close(c)
+			}
+			return nil, nil, err
+		}
+		stdio = append(stdio, newFd)
+	}
+	l, err := New(Args{
+		ID: "foo", Spec: spec, Conf: conf,
+		ControllerFD: fd, GoferFDs: []int{sandEnd}, StdioFDs: stdio,
+	})
+	if err != nil {
+		cleanup()
+		return nil, nil, err
+	}
+	return l, cleanup, nil
+}
+
+// TestRun runs a simple application in a sandbox with VFS2 disabled and checks that it succeeds.
+func TestRun(t *testing.T) {
+	doRun(t, false)
+}
+
+// TestRunVFS2 runs the same scenario as TestRun with VFS2 enabled.
+func TestRunVFS2(t *testing.T) {
+	doRun(t, true)
+}
+
+// doRun runs testSpec in a new loader and expects a clean start and exit.
+func doRun(t *testing.T, vfsEnabled bool) {
+	l, cleanup, err := createLoader(vfsEnabled, testSpec())
+	if err != nil {
+		t.Fatalf("error creating loader: %v", err)
+	}
+	defer l.Destroy()
+	defer cleanup()
+
+	// Run would block forever unless someone receives the start result, so
+	// receive it from a separate goroutine.
+	var startErr error
+	var done sync.WaitGroup
+	done.Add(1)
+	go func() {
+		defer done.Done()
+		startErr = <-l.ctrl.manager.startResultChan
+	}()
+
+	// Run the container.
+	if err := l.Run(); err != nil {
+		t.Errorf("error running container: %v", err)
+	}
+
+	// The start result must have arrived and must not be an error.
+	done.Wait()
+	if startErr != nil {
+		t.Errorf("error on startResultChan: %v", startErr)
+	}
+
+	// Wait for the application to exit.  It should succeed.
+	if status := l.WaitExit(); status.Code != 0 || status.Signo != 0 {
+		t.Errorf("application exited with status %+v, want 0", status)
+	}
+}
+
+// TestStartSignal tests that the controller Start message will cause
+// WaitForStartSignal to return. This variant runs with VFS2 disabled.
+func TestStartSignal(t *testing.T) {
+	doStartSignal(t, false)
+}
+
+// TestStartSignalVFS2 runs the same scenario as TestStartSignal with VFS2 enabled.
+func TestStartSignalVFS2(t *testing.T) {
+	doStartSignal(t, true)
+}
+
+// doStartSignal checks that WaitForStartSignal stays blocked until StartRoot
+// is called on the control manager, and returns shortly after that.
+func doStartSignal(t *testing.T, vfsEnabled bool) {
+	l, cleanup, err := createLoader(vfsEnabled, testSpec())
+	if err != nil {
+		t.Fatalf("error creating loader: %v", err)
+	}
+	defer l.Destroy()
+	defer cleanup()
+	// We aren't going to wait on this application, so the control server
+	// needs to be shut down manually.
+	defer l.ctrl.srv.Stop()
+
+	// Start a goroutine that calls WaitForStartSignal and writes to a
+	// channel when it returns. The channel is buffered so that the goroutine
+	// can finish even after this test has stopped listening (timeout below).
+	waitFinished := make(chan struct{}, 1)
+	go func() {
+		l.WaitForStartSignal()
+		// Pretend that Run() executed and returned no error.
+		l.ctrl.manager.startResultChan <- nil
+		waitFinished <- struct{}{}
+	}()
+
+	// Nothing has been written to the channel, so waitFinished should not
+	// return.  Give it a little bit of time to make sure the goroutine has
+	// started.
+	select {
+	case <-waitFinished:
+		t.Errorf("WaitForStartSignal completed but it should not have")
+	case <-time.After(50 * time.Millisecond):
+		// OK.
+	}
+
+	// Trigger the control server StartRoot method.
+	cid := "foo"
+	if err := l.ctrl.manager.StartRoot(&cid, nil); err != nil {
+		t.Errorf("error calling StartRoot: %v", err)
+	}
+
+	// Now WaitForStartSignal should return (within a short amount of time).
+	select {
+	case <-waitFinished:
+		// OK.
+	case <-time.After(50 * time.Millisecond):
+		t.Errorf("WaitForStartSignal did not complete but it should have")
+	}
+}
+
+// CreateMountTestcase describes one mount namespace creation scenario.
+type CreateMountTestcase struct {
+	name string
+	// Spec used to create the mount manager. procfs cannot be mounted without
+	// a kernel, so each spec MUST mount something other than procfs at /proc.
+	spec specs.Spec
+	// Paths that are expected to exist in the resulting fs.
+	expectedPaths []string
+}
+
+// createMountTestcases returns the mount test cases. When vfs2 is false, the last case also mounts and expects /sys/bar.
+func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
+	testCases := []*CreateMountTestcase{
+		&CreateMountTestcase{
+			// Only proc.
+			name: "only proc mount",
+			spec: specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
+				},
+				Mounts: []specs.Mount{
+					{
+						Destination: "/proc",
+						Type:        "tmpfs",
+					},
+				},
+			},
+			// /proc, /dev, and /sys should always be mounted.
+			expectedPaths: []string{"/proc", "/dev", "/sys"},
+		},
+		{
+			// Mount at a deep path, with many components that do
+			// not exist in the root.
+			name: "deep mount path",
+			spec: specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
+				},
+				Mounts: []specs.Mount{
+					{
+						Destination: "/some/very/very/deep/path",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/proc",
+						Type:        "tmpfs",
+					},
+				},
+			},
+			// /some/very/very/deep/path should be mounted, along with
+			// /proc, /dev, and /sys.
+			expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"},
+		},
+		&CreateMountTestcase{
+			// Mounts are nested inside each other.
+			name: "nested mounts",
+			spec: specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
+				},
+				Mounts: []specs.Mount{
+					{
+						Destination: "/proc",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/foo",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/foo/qux",
+						Type:        "tmpfs",
+					},
+					{
+						// File mounts with the same prefix.
+						Destination: "/foo/qux-quz",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/foo/bar",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/foo/bar/baz",
+						Type:        "tmpfs",
+					},
+					{
+						// A deep path that is in foo but not the other mounts.
+						Destination: "/foo/some/very/very/deep/path",
+						Type:        "tmpfs",
+					},
+				},
+			},
+			expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/qux",
+				"/foo/qux-quz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"},
+		},
+		&CreateMountTestcase{
+			name: "mount inside /dev",
+			spec: specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
+				},
+				Mounts: []specs.Mount{
+					{
+						Destination: "/proc",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/dev",
+						Type:        "tmpfs",
+					},
+					{
+						// Mounted by runsc by default.
+						Destination: "/dev/fd",
+						Type:        "tmpfs",
+					},
+					{
+						// Mount with the same prefix.
+						Destination: "/dev/fd-foo",
+						Type:        "tmpfs",
+					},
+					{
+						// Unsupported fs type.
+						Destination: "/dev/mqueue",
+						Type:        "mqueue",
+					},
+					{
+						Destination: "/dev/foo",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/dev/bar",
+						Type:        "tmpfs",
+					},
+				},
+			},
+			expectedPaths: []string{"/proc", "/dev", "/dev/fd-foo", "/dev/foo", "/dev/bar", "/sys"},
+		},
+	}
+
+	vfsCase := &CreateMountTestcase{
+		name: "mounts inside mandatory mounts",
+		spec: specs.Spec{
+			Root: &specs.Root{
+				Path:     os.TempDir(),
+				Readonly: true,
+			},
+			Mounts: []specs.Mount{
+				{
+					Destination: "/proc",
+					Type:        "tmpfs",
+				},
+				// TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports
+				//  MkDirAt in VFS2 (and remove the redundant append).
+				// {
+				//		Destination: "/sys/bar",
+				//		Type:        "tmpfs",
+				//	},
+				{
+					Destination: "/tmp/baz",
+					Type:        "tmpfs",
+				},
+			},
+		},
+		expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"},
+	}
+
+	if !vfs2 {
+		vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"})
+		vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar")
+	}
+	return append(testCases, vfsCase)
+}
+
+// TestCreateMountNamespace tests that MountNamespace can be created with various specs.
+func TestCreateMountNamespace(t *testing.T) {
+	for _, tc := range createMountTestcases(false /* vfs2 */) {
+		t.Run(tc.name, func(t *testing.T) {
+			conf := testConfig()
+			ctx := contexttest.Context(t)
+
+			sandEnd, cleanup, err := startGofer(tc.spec.Root.Path)
+			if err != nil {
+				t.Fatalf("failed to create gofer: %v", err)
+			}
+			defer cleanup()
+
+			mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{})
+			mns, err := mntr.createMountNamespace(ctx, conf)
+			if err != nil {
+				t.Fatalf("failed to create mount namespace: %v", err)
+			}
+			ctx = fs.WithRoot(ctx, mns.Root())
+			if err := mntr.mountSubmounts(ctx, conf, mns); err != nil {
+				t.Fatalf("failed to mount submounts: %v", err)
+			}
+
+			root := mns.Root()
+			defer root.DecRef()
+			for _, p := range tc.expectedPaths {
+				maxTraversals := uint(0)
+				if d, err := mns.FindInode(ctx, root, root, p, &maxTraversals); err != nil {
+					t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
+				} else {
+					d.DecRef()
+				}
+			}
+		})
+	}
+}
+
+// TestCreateMountNamespaceVFS2 tests that a VFS2 mount namespace can be set up
+// with various specs and that every expected path resolves in it.
+func TestCreateMountNamespaceVFS2(t *testing.T) {
+	for _, tc := range createMountTestcases(true /* vfs2 */) {
+		t.Run(tc.name, func(t *testing.T) {
+			// Start from the default test spec; only root and mounts vary.
+			spec := testSpec()
+			spec.Root = tc.spec.Root
+			spec.Mounts = tc.spec.Mounts
+
+			t.Logf("Using root: %q", spec.Root.Path)
+			l, loaderCleanup, err := createLoader(true /* VFS2 Enabled */, spec)
+			if err != nil {
+				t.Fatalf("failed to create loader: %v", err)
+			}
+			defer l.Destroy()
+			defer loaderCleanup()
+
+			mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
+			if err := mntr.processHints(l.conf); err != nil {
+				t.Fatalf("failed process hints: %v", err)
+			}
+
+			ctx := l.k.SupervisorContext()
+			mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs)
+			if err != nil {
+				t.Fatalf("failed to setupVFS2: %v", err)
+			}
+
+			root := mns.Root()
+			defer root.DecRef()
+			creds := l.rootProcArgs.Credentials
+			for _, p := range tc.expectedPaths {
+				// Resolve p starting at the namespace root.
+				pop := vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(p)}
+				vd, err := l.k.VFS().GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+				if err != nil {
+					t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
+					continue
+				}
+				vd.DecRef()
+			}
+		})
+	}
+}
+
+// TestRestoreEnvironment tests that the correct mounts are collected from the spec and config
+// in order to build the environment for restoring.
+func TestRestoreEnvironment(t *testing.T) {
+	testCases := []struct {
+		name          string
+		spec          *specs.Spec
+		ioFDs         []int
+		errorExpected bool
+		expectedRenv  fs.RestoreEnvironment
+	}{
+		{
+			name: "basic spec test",
+			spec: &specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
+				},
+				Mounts: []specs.Mount{
+					{
+						Destination: "/some/very/very/deep/path",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/proc",
+						Type:        "tmpfs",
+					},
+				},
+			},
+			ioFDs:         []int{0},
+			errorExpected: false,
+			expectedRenv: fs.RestoreEnvironment{
+				MountSources: map[string][]fs.MountArgs{
+					"9p": {
+						{
+							Dev:        "9pfs-/",
+							Flags:      fs.MountSourceFlags{ReadOnly: true},
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+						},
+					},
+					"tmpfs": {
+						{
+							Dev: "none",
+						},
+						{
+							Dev: "none",
+						},
+						{
+							Dev: "none",
+						},
+					},
+					"devtmpfs": {
+						{
+							Dev: "none",
+						},
+					},
+					"devpts": {
+						{
+							Dev: "none",
+						},
+					},
+					"sysfs": {
+						{
+							Dev: "none",
+						},
+					},
+				},
+			},
+		},
+		{
+			name: "bind type test",
+			spec: &specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
+				},
+				Mounts: []specs.Mount{
+					{
+						Destination: "/dev/fd-foo",
+						Type:        "bind",
+					},
+				},
+			},
+			ioFDs:         []int{0, 1},
+			errorExpected: false,
+			expectedRenv: fs.RestoreEnvironment{
+				MountSources: map[string][]fs.MountArgs{
+					"9p": {
+						{
+							Dev:        "9pfs-/",
+							Flags:      fs.MountSourceFlags{ReadOnly: true},
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+						},
+						{
+							Dev:        "9pfs-/dev/fd-foo",
+							DataString: "trans=fd,rfdno=1,wfdno=1,privateunixsocket=true,cache=remote_revalidating",
+						},
+					},
+					"tmpfs": {
+						{
+							Dev: "none",
+						},
+					},
+					"devtmpfs": {
+						{
+							Dev: "none",
+						},
+					},
+					"devpts": {
+						{
+							Dev: "none",
+						},
+					},
+					"proc": {
+						{
+							Dev: "none",
+						},
+					},
+					"sysfs": {
+						{
+							Dev: "none",
+						},
+					},
+				},
+			},
+		},
+		{
+			name: "options test",
+			spec: &specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
+				},
+				Mounts: []specs.Mount{
+					{
+						Destination: "/dev/fd-foo",
+						Type:        "tmpfs",
+						Options:     []string{"uid=1022", "noatime"},
+					},
+				},
+			},
+			ioFDs:         []int{0},
+			errorExpected: false,
+			expectedRenv: fs.RestoreEnvironment{
+				MountSources: map[string][]fs.MountArgs{
+					"9p": {
+						{
+							Dev:        "9pfs-/",
+							Flags:      fs.MountSourceFlags{ReadOnly: true},
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+						},
+					},
+					"tmpfs": {
+						{
+							Dev:        "none",
+							Flags:      fs.MountSourceFlags{NoAtime: true},
+							DataString: "uid=1022",
+						},
+						{
+							Dev: "none",
+						},
+					},
+					"devtmpfs": {
+						{
+							Dev: "none",
+						},
+					},
+					"devpts": {
+						{
+							Dev: "none",
+						},
+					},
+					"proc": {
+						{
+							Dev: "none",
+						},
+					},
+					"sysfs": {
+						{
+							Dev: "none",
+						},
+					},
+				},
+			},
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			conf := testConfig()
+			mntr := newContainerMounter(tc.spec, tc.ioFDs, nil, &podMountHints{})
+			actualRenv, err := mntr.createRestoreEnvironment(conf)
+			if !tc.errorExpected && err != nil {
+				t.Fatalf("could not create restore environment for test:%s: %v", tc.name, err)
+			} else if tc.errorExpected {
+				if err == nil {
+					t.Errorf("expected an error, but no error occurred.")
+				}
+			} else {
+				if !reflect.DeepEqual(*actualRenv, tc.expectedRenv) {
+					t.Errorf("restore environments did not match for test:%s\ngot:%+v\nwant:%+v\n", tc.name, *actualRenv, tc.expectedRenv)
+				}
+			}
+		})
+	}
+}
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
new file mode 100644
index 000000000..0af30456e
--- /dev/null
+++ b/runsc/boot/network.go
@@ -0,0 +1,338 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"net"
+	"runtime"
+	"strings"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo"
+	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
+	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/urpc"
+)
+
+var (
+	// DefaultLoopbackLink contains IP addresses "127.0.0.1" and "::1" and
+	// routes "127.0.0.0/8" and "::1/128" on "lo" interface.
+	DefaultLoopbackLink = LoopbackLink{
+		Name: "lo",
+		Addresses: []net.IP{
+			net.IP("\x7f\x00\x00\x01"),
+			net.IPv6loopback,
+		},
+		Routes: []Route{
+			{
+				Destination: net.IPNet{
+					IP:   net.IPv4(0x7f, 0, 0, 0),
+					Mask: net.IPv4Mask(0xff, 0, 0, 0),
+				},
+			},
+			{
+				Destination: net.IPNet{
+					IP:   net.IPv6loopback,
+					Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)),
+				},
+			},
+		},
+	}
+)
+
+// Network exposes methods that can be used to configure a network stack.
+type Network struct {
+	Stack *stack.Stack // The stack that NICs, addresses and routes are added to.
+}
+
+// Route represents a route in the network stack.
+type Route struct {
+	Destination net.IPNet // IP and mask; converted to a tcpip.Subnet by toTcpipRoute.
+	Gateway     net.IP    // May be nil; the loopback routes in this file set none.
+}
+
+// DefaultRoute represents a catch all route to the default gateway.
+type DefaultRoute struct {
+	Route Route
+	Name  string // Name of the link (interface) whose NIC the route is bound to.
+}
+
+// QueueingDiscipline is used to specify the kind of Queueing Discipline to
+// apply for a given FDBasedLink.
+type QueueingDiscipline int
+
+const (
+	// QDiscNone disables any queueing for the underlying FD.
+	QDiscNone QueueingDiscipline = iota
+
+	// QDiscFIFO applies a simple fifo based queue to the underlying
+	// FD.
+	QDiscFIFO
+)
+
+// MakeQueueingDiscipline returns the QueueingDiscipline named by s ("none" or
+// "fifo"), or an error if s names neither.
+func MakeQueueingDiscipline(s string) (QueueingDiscipline, error) {
+	known := map[string]QueueingDiscipline{
+		"none": QDiscNone,
+		"fifo": QDiscFIFO,
+	}
+	if q, ok := known[s]; ok {
+		return q, nil
+	}
+	return 0, fmt.Errorf("unsupported qdisc specified: %q", s)
+}
+
+// String implements fmt.Stringer.
+func (q QueueingDiscipline) String() string {
+	if q == QDiscNone {
+		return "none"
+	}
+	if q == QDiscFIFO {
+		return "fifo"
+	}
+	// Any other value is not a known discipline.
+	panic(fmt.Sprintf("Invalid queueing discipline: %d", q))
+}
+
+// FDBasedLink configures an fd-based link.
+type FDBasedLink struct {
+	Name               string // Interface name; default routes refer to links by it.
+	MTU                int
+	Addresses          []net.IP
+	Routes             []Route
+	GSOMaxSize         uint32
+	SoftwareGSOEnabled bool
+	LinkAddress        net.HardwareAddr
+	QDisc              QueueingDiscipline // QDiscFIFO wraps the endpoint in a fifo queue.
+
+	// NumChannels controls how many underlying FD's are to be used to
+	// create this endpoint.
+	NumChannels int
+}
+
+// LoopbackLink configures a loopback link.
+type LoopbackLink struct {
+	Name      string
+	Addresses []net.IP
+	Routes    []Route
+}
+
+// CreateLinksAndRoutesArgs are arguments to CreateLinksAndRoutes.
+type CreateLinksAndRoutesArgs struct {
+	// FilePayload contains the fds associated with the FDBasedLinks. The
+	// number of fd's should match the sum of the NumChannels field of the
+	// FDBasedLink entries below.
+	urpc.FilePayload
+
+	LoopbackLinks []LoopbackLink
+	FDBasedLinks  []FDBasedLink
+
+	Defaultv4Gateway DefaultRoute
+	Defaultv6Gateway DefaultRoute
+}
+
+// Empty returns true if none of the route's fields has been set.
+func (r *Route) Empty() bool {
+	return r.Gateway == nil && r.Destination.IP == nil && r.Destination.Mask == nil
+}
+
+// toTcpipRoute converts r into a tcpip.Route attached to NIC 'id'. It returns
+// any error reported by tcpip.NewSubnet for the destination.
+func (r *Route) toTcpipRoute(id tcpip.NICID) (tcpip.Route, error) {
+	dst := ipToAddress(r.Destination.IP)
+	mask := ipMaskToAddressMask(r.Destination.Mask)
+	subnet, err := tcpip.NewSubnet(dst, mask)
+	if err != nil {
+		return tcpip.Route{}, err
+	}
+	return tcpip.Route{Destination: subnet, Gateway: ipToAddress(r.Gateway), NIC: id}, nil
+}
+
+// CreateLinksAndRoutes creates links and routes in a network stack.  It should
+// only be called once. args.FilePayload must carry exactly one FD per channel
+// of every FDBasedLink; each FD is duplicated before it is used.
+func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error {
+	wantFDs := 0
+	for _, l := range args.FDBasedLinks {
+		wantFDs += l.NumChannels
+	}
+	if got := len(args.FilePayload.Files); got != wantFDs {
+		return fmt.Errorf("args.FilePayload.Files has %d FD's but we need %d entries based on FDBasedLinks", got, wantFDs)
+	}
+
+	var nicID tcpip.NICID
+	nicids := make(map[string]tcpip.NICID)
+
+	// Collect routes from all links.
+	var routes []tcpip.Route
+
+	// Loopback interfaces normally appear before other interfaces.
+	for _, link := range args.LoopbackLinks {
+		nicID++
+		nicids[link.Name] = nicID
+		linkEP := loopback.New()
+		log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses)
+		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
+			return err
+		}
+
+		// Collect the routes from this link.
+		for _, r := range link.Routes {
+			route, err := r.toTcpipRoute(nicID)
+			if err != nil {
+				return err
+			}
+			routes = append(routes, route)
+		}
+	}
+
+	fdOffset := 0
+	for _, link := range args.FDBasedLinks {
+		nicID++
+		nicids[link.Name] = nicID
+		FDs := []int{}
+		for j := 0; j < link.NumChannels; j++ {
+			// Copy the underlying FD.
+			oldFD := args.FilePayload.Files[fdOffset].Fd()
+			newFD, err := syscall.Dup(int(oldFD))
+			if err != nil {
+				// Don't leak the copies already made for this link.
+				for _, fd := range FDs {
+					syscall.Close(fd)
+				}
+				return fmt.Errorf("failed to dup FD %v: %v", oldFD, err)
+			}
+			FDs = append(FDs, newFD)
+			fdOffset++
+		}
+
+		mac := tcpip.LinkAddress(link.LinkAddress)
+		log.Infof("gso max size is: %d", link.GSOMaxSize)
+		linkEP, err := fdbased.New(&fdbased.Options{
+			FDs:                FDs,
+			MTU:                uint32(link.MTU),
+			EthernetHeader:     true,
+			Address:            mac,
+			PacketDispatchMode: fdbased.RecvMMsg,
+			GSOMaxSize:         link.GSOMaxSize,
+			SoftwareGSOEnabled: link.SoftwareGSOEnabled,
+			RXChecksumOffload:  true,
+		})
+		if err != nil {
+			return err
+		}
+		switch link.QDisc {
+		case QDiscNone:
+		case QDiscFIFO:
+			log.Infof("Enabling FIFO QDisc on %q", link.Name)
+			linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000)
+		}
+
+		log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
+		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
+			return err
+		}
+
+		// Collect the routes from this link.
+		for _, r := range link.Routes {
+			route, err := r.toTcpipRoute(nicID)
+			if err != nil {
+				return err
+			}
+			routes = append(routes, route)
+		}
+	}
+
+	if !args.Defaultv4Gateway.Route.Empty() {
+		nicID, ok := nicids[args.Defaultv4Gateway.Name]
+		if !ok {
+			return fmt.Errorf("invalid interface name %q for default route", args.Defaultv4Gateway.Name)
+		}
+		route, err := args.Defaultv4Gateway.Route.toTcpipRoute(nicID)
+		if err != nil {
+			return err
+		}
+		routes = append(routes, route)
+	}
+
+	if !args.Defaultv6Gateway.Route.Empty() {
+		nicID, ok := nicids[args.Defaultv6Gateway.Name]
+		if !ok {
+			return fmt.Errorf("invalid interface name %q for default route", args.Defaultv6Gateway.Name)
+		}
+		route, err := args.Defaultv6Gateway.Route.toTcpipRoute(nicID)
+		if err != nil {
+			return err
+		}
+		routes = append(routes, route)
+	}
+
+	log.Infof("Setting routes %+v", routes)
+	n.Stack.SetRouteTable(routes)
+	return nil
+}
+
+// createNICWithAddrs registers a NIC named 'name' with ID 'id' in the network
+// stack, wrapping 'ep' in a sniffer, and then assigns 'addrs' to it.
+func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP) error {
+	nicOpts := stack.NICOptions{Name: name}
+	if err := n.Stack.CreateNICWithOptions(id, sniffer.New(ep), nicOpts); err != nil {
+		return fmt.Errorf("CreateNICWithOptions(%d, _, %+v) failed: %v", id, nicOpts, err)
+	}
+
+	// The ARP address goes first, before any IP address.
+	if err := n.Stack.AddAddress(id, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+		return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, arp.ProtocolNumber, arp.ProtocolAddress, err)
+	}
+
+	for i := range addrs {
+		protocol, address := ipToAddressAndProto(addrs[i])
+		if err := n.Stack.AddAddress(id, protocol, address); err != nil {
+			return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, protocol, address, err)
+		}
+	}
+	return nil
+}
+
+// ipToAddressAndProto converts IP to tcpip.Address and a protocol number. The
+// version is decided by ip.To4(), not len(ip), since IPv4 may use 16 bytes.
+func ipToAddressAndProto(ip net.IP) (tcpip.NetworkProtocolNumber, tcpip.Address) {
+	v4 := ip.To4()
+	if v4 == nil {
+		return ipv6.ProtocolNumber, tcpip.Address(ip)
+	}
+	return ipv4.ProtocolNumber, tcpip.Address(v4)
+}
+
+// ipToAddress converts IP to tcpip.Address, discarding the protocol number.
+func ipToAddress(ip net.IP) tcpip.Address {
+	_, addr := ipToAddressAndProto(ip)
+	return addr
+}
+
+// ipMaskToAddressMask converts IPMask to tcpip.AddressMask, ignoring the protocol.
+func ipMaskToAddressMask(ipMask net.IPMask) tcpip.AddressMask {
+	maskAsIP := net.IP(ipMask)
+	return tcpip.AddressMask(ipToAddress(maskAsIP))
+}
diff --git a/runsc/boot/platforms/BUILD b/runsc/boot/platforms/BUILD
new file mode 100644
index 000000000..77774f43c
--- /dev/null
+++ b/runsc/boot/platforms/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "platforms",
+    srcs = ["platforms.go"],
+    visibility = [
+        "//runsc:__subpackages__",
+    ],
+    deps = [
+        "//pkg/sentry/platform/kvm",
+        "//pkg/sentry/platform/ptrace",
+    ],
+)
diff --git a/runsc/boot/platforms/platforms.go b/runsc/boot/platforms/platforms.go
new file mode 100644
index 000000000..056b46ad5
--- /dev/null
+++ b/runsc/boot/platforms/platforms.go
@@ -0,0 +1,30 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package platforms imports all available platform packages.
+package platforms
+
+import (
+	// Import platforms that runsc might use.
+	_ "gvisor.dev/gvisor/pkg/sentry/platform/kvm"
+	_ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace"
+)
+
+const (
+	// Ptrace is the platform name that runs the sandbox with the ptrace platform.
+	Ptrace = "ptrace"
+
+	// KVM is the platform name that runs the sandbox with the KVM platform.
+	KVM = "kvm"
+)
diff --git a/runsc/boot/pprof/BUILD b/runsc/boot/pprof/BUILD
new file mode 100644
index 000000000..29cb42b2f
--- /dev/null
+++ b/runsc/boot/pprof/BUILD
@@ -0,0 +1,11 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "pprof",
+    srcs = ["pprof.go"],
+    visibility = [
+        "//runsc:__subpackages__",
+    ],
+)
diff --git a/runsc/boot/pprof/pprof.go b/runsc/boot/pprof/pprof.go
new file mode 100644
index 000000000..1ded20dee
--- /dev/null
+++ b/runsc/boot/pprof/pprof.go
@@ -0,0 +1,20 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pprof provides a stub to initialize custom profilers.
+package pprof
+
+// Initialize will be called at boot for initializing custom profilers. This stub does nothing.
+func Initialize() {
+}
diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go
new file mode 100644
index 000000000..fbfd3b07c
--- /dev/null
+++ b/runsc/boot/strace.go
@@ -0,0 +1,40 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/strace"
+)
+
+// enableStrace initializes the strace package and, when conf.Strace is set,
+// configures the maximum log size and enables syscall tracing to the log sink
+// for either all syscalls or only those listed in conf.StraceSyscalls.
+func enableStrace(conf *Config) error {
+	// We must initialize even if strace is not enabled.
+	strace.Initialize()
+
+	if !conf.Strace {
+		return nil
+	}
+
+	// A zero log size means "not configured": fall back to 1024.
+	logSize := conf.StraceLogSize
+	if logSize == 0 {
+		logSize = 1024
+	}
+	strace.LogMaximumSize = logSize
+
+	// An explicit syscall list restricts tracing; otherwise trace everything.
+	if len(conf.StraceSyscalls) > 0 {
+		return strace.Enable(conf.StraceSyscalls, strace.SinkTypeLog)
+	}
+	strace.EnableAll(strace.SinkTypeLog)
+	return nil
+}
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
new file mode 100644
index 000000000..d1397ed2c
--- /dev/null
+++ b/runsc/boot/vfs.go
@@ -0,0 +1,343 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"path"
+	"sort"
+	"strconv"
+	"strings"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/devices/memdev"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	devpts2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts"
+	devtmpfsimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+	goferimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
+	procimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
+	sysimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
+	tmpfsimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// registerFilesystems registers the devpts, devtmpfs, gofer, proc, sys and
+// tmpfs filesystem types with vfsObj, registers the memory devices, and then
+// uses a devtmpfs accessor to initialize devtmpfs and create the memory device
+// files in it. MustRegisterFilesystemType is used for the registrations, so
+// only the device/devtmpfs setup steps are reported through the returned error.
+func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error {
+	vfsObj.MustRegisterFilesystemType(devpts2.Name, &devpts2.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserList: true,
+		// TODO(b/29356795): Users may mount this once the terminals are in a
+		//  usable state.
+		AllowUserMount: false,
+	})
+	vfsObj.MustRegisterFilesystemType(devtmpfsimpl.Name, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	// The gofer filesystem does not set AllowUserMount, so it keeps the zero
+	// value (false).
+	vfsObj.MustRegisterFilesystemType(goferimpl.Name, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserList: true,
+	})
+	vfsObj.MustRegisterFilesystemType(procimpl.Name, &procimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(sysimpl.Name, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(tmpfsimpl.Name, &tmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+
+	// Setup files in devtmpfs.
+	if err := memdev.Register(vfsObj); err != nil {
+		return fmt.Errorf("registering memdev: %w", err)
+	}
+	a, err := devtmpfsimpl.NewAccessor(ctx, vfsObj, creds, devtmpfsimpl.Name)
+	if err != nil {
+		return fmt.Errorf("creating devtmpfs accessor: %w", err)
+	}
+	// The accessor is only needed while populating devtmpfs below.
+	defer a.Release()
+
+	if err := a.UserspaceInit(ctx); err != nil {
+		return fmt.Errorf("initializing userspace: %w", err)
+	}
+	if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil {
+		return fmt.Errorf("creating devtmpfs files: %w", err)
+	}
+	return nil
+}
+
+// setupContainerVFS2 initializes the kernel's VFS, builds the container's
+// mount namespace through mntr, stores it in procArgs.MountNamespaceVFS2 and
+// finally resolves procArgs.Filename from procArgs.Argv[0].
+func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+	if err := mntr.k.VFS().Init(); err != nil {
+		return fmt.Errorf("failed to initialize VFS: %w", err)
+	}
+	mns, err := mntr.setupVFS2(ctx, conf, procArgs)
+	if err != nil {
+		return fmt.Errorf("failed to setupFS: %w", err)
+	}
+	// The executable lookup below needs the mount namespace to be set.
+	procArgs.MountNamespaceVFS2 = mns
+	return setExecutablePathVFS2(ctx, procArgs)
+}
+
+// setExecutablePathVFS2 sets procArgs.Filename to the path of the executable
+// named by procArgs.Argv[0]:
+//
+//   - an absolute path is used as is;
+//   - a relative path containing a '/' is joined to the working directory,
+//     which must be absolute;
+//   - a bare name is looked up in the directories returned by fs.GetPath for
+//     procArgs.Envv, resolved from the root of procArgs.MountNamespaceVFS2.
+//
+// It returns an error if Argv is empty, if the working directory is needed
+// but is not absolute, if a lookup fails with anything other than ENOENT or
+// EACCES, or if the executable is not found in any of the search paths.
+func setExecutablePathVFS2(ctx context.Context, procArgs *kernel.CreateProcessArgs) error {
+	if len(procArgs.Argv) == 0 {
+		return fmt.Errorf("no executable specified: argv is empty")
+	}
+	exe := procArgs.Argv[0]
+
+	// Absolute paths can be used directly.
+	if path.IsAbs(exe) {
+		procArgs.Filename = exe
+		return nil
+	}
+
+	// Paths with '/' in them are relative to the working directory, which
+	// must be absolute. (Index 0 is impossible here: that would have been an
+	// absolute path, handled above.)
+	if strings.IndexByte(exe, '/') > 0 {
+		if !path.IsAbs(procArgs.WorkingDirectory) {
+			return fmt.Errorf("working directory %q must be absolute", procArgs.WorkingDirectory)
+		}
+		procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe)
+		return nil
+	}
+
+	// Otherwise, We must lookup the name in the paths, starting from the
+	// root directory.
+	root := procArgs.MountNamespaceVFS2.Root()
+	defer root.DecRef()
+
+	paths := fs.GetPath(procArgs.Envv)
+	creds := procArgs.Credentials
+
+	for _, p := range paths {
+		binPath := path.Join(p, exe)
+		pop := &vfs.PathOperation{
+			Root:               root,
+			Start:              root,
+			Path:               fspath.Parse(binPath),
+			FollowFinalSymlink: true,
+		}
+		opts := &vfs.OpenOptions{
+			FileExec: true,
+			Flags:    linux.O_RDONLY,
+		}
+		dentry, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts)
+		if err == syserror.ENOENT || err == syserror.EACCES {
+			// Didn't find it here.
+			continue
+		}
+		if err != nil {
+			return err
+		}
+		// The file was only opened to check that it exists and can be
+		// executed; drop the reference right away.
+		dentry.DecRef()
+
+		procArgs.Filename = binPath
+		return nil
+	}
+
+	return fmt.Errorf("executable %q not found in $PATH=%q", exe, strings.Join(paths, ":"))
+}
+
+// setupVFS2 registers the VFS2 filesystems, creates the container's mount
+// namespace with the root filesystem and mounts all submounts into it. All of
+// this is done with root credentials in the user namespace of
+// procArgs.Credentials. The new mount namespace is returned.
+func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
+	log.Infof("Configuring container's file system with VFS2")
+
+	// Create context with root credentials to mount the filesystem (the current
+	// user may not be privileged enough).
+	rootCreds := auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
+	rootProcArgs := *procArgs
+	rootProcArgs.WorkingDirectory = "/"
+	rootProcArgs.Credentials = rootCreds
+	rootProcArgs.Umask = 0022
+	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
+	// The context must be derived from rootProcArgs; deriving it from procArgs
+	// would silently discard the root settings prepared above.
+	rootCtx := rootProcArgs.NewContext(c.k)
+
+	if err := registerFilesystems(rootCtx, c.k.VFS(), rootCreds); err != nil {
+		return nil, fmt.Errorf("register filesystems: %w", err)
+	}
+
+	mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds)
+	if err != nil {
+		return nil, fmt.Errorf("creating mount namespace: %w", err)
+	}
+	rootProcArgs.MountNamespaceVFS2 = mns
+
+	// Mount submounts.
+	if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil {
+		return nil, fmt.Errorf("mounting submounts vfs2: %w", err)
+	}
+	return mns, nil
+}
+
+// createMountNamespaceVFS2 creates a new mount namespace whose root is a
+// rootFsName filesystem. It takes the next FD from c.fds and passes it to the
+// filesystem through the 9P mount options built by p9MountOptionsVFS2.
+func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
+	fd := c.fds.remove()
+	opts := strings.Join(p9MountOptionsVFS2(fd, conf.FileAccess), ",")
+
+	log.Infof("Mounting root over 9P, ioFD: %d", fd)
+	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", rootFsName, &vfs.GetFilesystemOptions{Data: opts})
+	if err != nil {
+		return nil, fmt.Errorf("setting up mount namespace: %w", err)
+	}
+	return mns, nil
+}
+
+// mountSubmountsVFS2 sorts c.mounts and mounts each of them into mns, stopping
+// at the first failure. On success it returns the result of
+// c.checkDispenser().
+func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
+	c.prepareMountsVFS2()
+
+	for _, submount := range c.mounts {
+		log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options)
+		// &submount points at the loop's copy of the element, not at the
+		// entry in c.mounts.
+		if err := c.mountSubmountVFS2(ctx, conf, mns, creds, &submount); err != nil {
+			return err
+		}
+	}
+
+	// TODO(gvisor.dev/issue/1487): implement mountTmp from fs.go.
+
+	return c.checkDispenser()
+}
+
+// prepareMountsVFS2 sorts c.mounts in place by the length of their
+// Destination.
+func (c *containerMounter) prepareMountsVFS2() {
+	// Sort the mounts so that we don't place children before parents. A
+	// parent's destination path is always shorter than that of its children,
+	// so ordering by length is sufficient.
+	sort.Slice(c.mounts, func(i, j int) bool { return len(c.mounts[i].Destination) < len(c.mounts[j].Destination) })
+}
+
+// mountSubmountVFS2 mounts a single submount at submount.Destination inside
+// mns. Mount types for which getMountNameAndOptionsVFS2 returns no filesystem
+// name are skipped without error. The mount point is created first if it does
+// not exist yet.
+//
+// TODO(gvisor.dev/issue/1487): Implement submount options similar to the VFS1 version.
+func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *specs.Mount) error {
+	root := mns.Root()
+	defer root.DecRef()
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(submount.Destination),
+	}
+
+	fsName, options, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, *submount)
+	if err != nil {
+		return fmt.Errorf("mountOptions failed: %w", err)
+	}
+	if fsName == "" {
+		// Filesystem is not supported (e.g. cgroup), just skip it.
+		return nil
+	}
+
+	// Make sure the destination directory (and its ancestors) exist.
+	if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil {
+		return err
+	}
+	log.Debugf("directory exists or made directory for submount: %s", submount.Destination)
+
+	opts := &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			Data: strings.Join(options, ","),
+		},
+		InternalMount: true,
+	}
+
+	// All writes go to upper, be paranoid and make lower readonly.
+	opts.ReadOnly = useOverlay
+
+	if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil {
+		return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
+	}
+	log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts)
+	return nil
+}
+
+// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
+// used for mounts.
+//
+// An unknown mount type is not an error: a warning is logged and an empty
+// fsName is returned. For bind mounts the next FD is taken from c.fds, so
+// calling this function has a side effect on the FD dispenser.
+func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m specs.Mount) (string, []string, bool, error) {
+	var (
+		fsName     string
+		opts       []string
+		useOverlay bool
+	)
+
+	switch m.Type {
+	case devpts, devtmpfs, proc, sysfs:
+		fsName = m.Type
+	case nonefs:
+		// NOTE(review): "none" mounts are mapped to sysfs here; presumably this
+		// mirrors the VFS1 behavior -- confirm.
+		fsName = sysfs
+	case tmpfs:
+		fsName = m.Type
+
+		// Only the options listed in tmpfsAllowedOptions are passed through.
+		var err error
+		opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...)
+		if err != nil {
+			return "", nil, false, err
+		}
+
+	case bind:
+		fd := c.fds.remove()
+		fsName = "9p"
+		opts = p9MountOptionsVFS2(fd, c.getMountAccessType(m))
+		// If configured, add overlay to all writable mounts.
+		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
+
+	default:
+		log.Warningf("ignoring unknown filesystem type %q", m.Type)
+	}
+	return fsName, opts, useOverlay, nil
+}
+
+// p9MountOptionsVFS2 creates a slice of options for a p9 mount that uses fd
+// for both reading and writing. Shared file access additionally selects the
+// remote_revalidating cache policy.
+// TODO(gvisor.dev/issue/1200): Remove this version in favor of the one in
+// fs.go when privateunixsocket lands.
+func p9MountOptionsVFS2(fd int, fa FileAccessType) []string {
+	fdStr := strconv.Itoa(fd)
+	mountOpts := []string{"trans=fd", "rfdno=" + fdStr, "wfdno=" + fdStr}
+	if fa != FileAccessShared {
+		return mountOpts
+	}
+	return append(mountOpts, "cache=remote_revalidating")
+}
+
+// makeSyntheticMount ensures that currentPath exists under root. If a stat of
+// the path fails with ENOENT, the parent is created first (recursively) and
+// then the directory itself is created with mode 0777 as a synthetic mount
+// point. Any other stat error is returned.
+//
+// NOTE(review): path.Dir("/") is "/", so an ENOENT for the root path itself
+// would recurse without terminating; presumably the root always exists --
+// confirm.
+func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error {
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(currentPath),
+	}
+
+	_, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{})
+	switch {
+	case err == syserror.ENOENT:
+		// Create missing ancestors before the directory itself.
+		if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil {
+			return err
+		}
+		mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
+		if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil {
+			return fmt.Errorf("failed to makedir for mount %+v: %w", target, err)
+		}
+		return nil
+
+	case err != nil:
+		return fmt.Errorf("stat failed for mount %+v: %w", target, err)
+
+	default:
+		// The path already exists, nothing to do.
+		return nil
+	}
+}
diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD
new file mode 100644
index 000000000..d4c7bdfbb
--- /dev/null
+++ b/runsc/cgroup/BUILD
@@ -0,0 +1,23 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+# Go library for package cgroup (cgroup.go). Visible to the //:sandbox package
+# group.
+go_library(
+    name = "cgroup",
+    srcs = ["cgroup.go"],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/log",
+        "//runsc/specutils",
+        "@com_github_cenkalti_backoff//:go_default_library",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+    ],
+)
+
+# Unit tests for the cgroup library. Tagged "local".
+go_test(
+    name = "cgroup_test",
+    size = "small",
+    srcs = ["cgroup_test.go"],
+    library = ":cgroup",
+    tags = ["local"],
+)
diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go
new file mode 100644
index 000000000..fa40ee509
--- /dev/null
+++ b/runsc/cgroup/cgroup.go
@@ -0,0 +1,537 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package cgroup provides an interface to read and write configuration to
+// cgroup.
+package cgroup
+
+import (
+	"bufio"
+	"context"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"syscall"
+	"time"
+
+	"github.com/cenkalti/backoff"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+const (
+	cgroupRoot = "/sys/fs/cgroup"
+)
+
+// controllers maps each cgroup controller name handled by this package to the
+// implementation that applies the OCI resource configuration for it.
+var controllers = map[string]controller{
+	"blkio":    &blockIO{},
+	"cpu":      &cpu{},
+	"cpuset":   &cpuSet{},
+	"memory":   &memory{},
+	"net_cls":  &networkClass{},
+	"net_prio": &networkPrio{},
+	"pids":     &pids{},
+
+	// These controllers either don't have anything in the OCI spec or are
+	// irrelevant for a sandbox.
+	"devices":    &noop{},
+	"freezer":    &noop{},
+	"perf_event": &noop{},
+	"systemd":    &noop{},
+}
+
+// setOptionalValueInt writes *val in decimal to the cgroup file 'name' under
+// 'path'. Nothing is written when val is nil or zero.
+func setOptionalValueInt(path, name string, val *int64) error {
+	if val != nil && *val != 0 {
+		return setValue(path, name, strconv.FormatInt(*val, 10))
+	}
+	return nil
+}
+
+// setOptionalValueUint writes *val in decimal to the cgroup file 'name' under
+// 'path'. Nothing is written when val is nil or zero.
+func setOptionalValueUint(path, name string, val *uint64) error {
+	if val != nil && *val != 0 {
+		return setValue(path, name, strconv.FormatUint(*val, 10))
+	}
+	return nil
+}
+
+// setOptionalValueUint32 writes *val in decimal to the cgroup file 'name'
+// under 'path'. Nothing is written when val is nil or zero.
+func setOptionalValueUint32(path, name string, val *uint32) error {
+	if val != nil && *val != 0 {
+		return setValue(path, name, strconv.FormatUint(uint64(*val), 10))
+	}
+	return nil
+}
+
+// setOptionalValueUint16 writes *val in decimal to the cgroup file 'name'
+// under 'path'. Nothing is written when val is nil or zero.
+func setOptionalValueUint16(path, name string, val *uint16) error {
+	if val != nil && *val != 0 {
+		return setValue(path, name, strconv.FormatUint(uint64(*val), 10))
+	}
+	return nil
+}
+
+// setValue writes 'data' to the file 'name' inside directory 'path', using
+// permission bits 0700 if the file has to be created.
+func setValue(path, name, data string) error {
+	fullpath := filepath.Join(path, name)
+	return ioutil.WriteFile(fullpath, []byte(data), 0700)
+}
+
+// getValue returns the full, untrimmed content of the file 'name' inside
+// directory 'path'.
+func getValue(path, name string) (string, error) {
+	content, err := ioutil.ReadFile(filepath.Join(path, name))
+	if err != nil {
+		return "", err
+	}
+	return string(content), nil
+}
+
+// getInt reads the file 'name' inside directory 'path' and parses its content,
+// with surrounding whitespace removed, as a decimal integer.
+func getInt(path, name string) (int, error) {
+	raw, err := getValue(path, name)
+	if err != nil {
+		return 0, err
+	}
+	trimmed := strings.TrimSpace(raw)
+	return strconv.Atoi(trimmed)
+}
+
+// fillFromAncestor sets the value of a cgroup file from the first ancestor
+// that has content. It does nothing if the file in 'path' has already been set.
+//
+// It returns the whitespace-trimmed value that the file holds on return. Every
+// empty file between 'path' and the first ancestor with content is filled in
+// on the way back. An error is returned if a file cannot be read or written,
+// which includes walking past the topmost directory that has such a file.
+func fillFromAncestor(path string) (string, error) {
+	out, err := ioutil.ReadFile(path)
+	if err != nil {
+		return "", err
+	}
+	val := strings.TrimSpace(string(out))
+	if val != "" {
+		// File is set, stop here.
+		return val, nil
+	}
+
+	// File is not set, recurse to parent and then set here.
+	name := filepath.Base(path)
+	// Dir(path) is the cgroup directory; its Dir is the parent cgroup.
+	parent := filepath.Dir(filepath.Dir(path))
+	val, err = fillFromAncestor(filepath.Join(parent, name))
+	if err != nil {
+		return "", err
+	}
+	if err := ioutil.WriteFile(path, []byte(val), 0700); err != nil {
+		return "", err
+	}
+	return val, nil
+}
+
+// countCpuset returns the number of CPU in a string formatted like:
+// 		"0-2,7,12-14  # bits 0, 1, 2, 7, 12, 13, and 14 set" - man 7 cpuset
+func countCpuset(cpuset string) (int, error) {
+	var count int
+	for _, p := range strings.Split(cpuset, ",") {
+		interval := strings.Split(p, "-")
+		switch len(interval) {
+		case 1:
+			if _, err := strconv.Atoi(interval[0]); err != nil {
+				return 0, err
+			}
+			count++
+
+		case 2:
+			start, err := strconv.Atoi(interval[0])
+			if err != nil {
+				return 0, err
+			}
+			end, err := strconv.Atoi(interval[1])
+			if err != nil {
+				return 0, err
+			}
+			if start < 0 || end < 0 || start > end {
+				return 0, fmt.Errorf("invalid cpuset: %q", p)
+			}
+			count += end - start + 1
+
+		default:
+			return 0, fmt.Errorf("invalid cpuset: %q", p)
+		}
+	}
+	return count, nil
+}
+
+// LoadPaths loads cgroup paths for given 'pid', may be set to 'self'.
+//
+// It parses /proc/<pid>/cgroup and returns a map from controller name to the
+// cgroup path of the process in that controller's hierarchy. A line that lists
+// several controllers yields one entry per controller, all with the same path.
+func LoadPaths(pid string) (map[string]string, error) {
+	f, err := os.Open(filepath.Join("/proc", pid, "cgroup"))
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	paths := make(map[string]string)
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		// Format: ID:controller1,controller2:path
+		// Example: 2:cpu,cpuacct:/user.slice
+		//
+		// Split into at most 3 fields: the path is the last field and may
+		// itself contain ':'.
+		tokens := strings.SplitN(scanner.Text(), ":", 3)
+		if len(tokens) != 3 {
+			return nil, fmt.Errorf("invalid cgroups file, line: %q", scanner.Text())
+		}
+		for _, ctrlr := range strings.Split(tokens[1], ",") {
+			paths[ctrlr] = tokens[2]
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+	return paths, nil
+}
+
+// Cgroup represents a group inside all controllers. For example: Name='/foo/bar'
+// maps to /sys/fs/cgroup/<controller>/foo/bar on all controllers.
+type Cgroup struct {
+	// Name is the cgroup path taken from the spec (Linux.CgroupsPath).
+	Name string `json:"name"`
+	// Parents maps a controller name to the path that Name is joined to for
+	// that controller. New only fills it in when Name is a relative path.
+	Parents map[string]string `json:"parents"`
+	// Own is set by Install when it creates the cgroup itself. Uninstall only
+	// removes the cgroup when it is set.
+	Own bool `json:"own"`
+}
+
+// New creates a new Cgroup instance if the spec includes a cgroup path.
+// Returns nil otherwise.
+func New(spec *specs.Spec) (*Cgroup, error) {
+	if spec.Linux == nil || spec.Linux.CgroupsPath == "" {
+		return nil, nil
+	}
+	var parents map[string]string
+	if !filepath.IsAbs(spec.Linux.CgroupsPath) {
+		var err error
+		parents, err = LoadPaths("self")
+		if err != nil {
+			return nil, fmt.Errorf("finding current cgroups: %v", err)
+		}
+	}
+	return &Cgroup{
+		Name:    spec.Linux.CgroupsPath,
+		Parents: parents,
+	}, nil
+}
+
+// Install creates and configures cgroups according to 'res'. If cgroup path
+// already exists, it means that the caller has already provided a
+// pre-configured cgroups, and 'res' is ignored.
+//
+// Only the "memory" controller path is checked to decide whether the cgroup
+// already exists. When the cgroup is created here, c.Own is set so that
+// Uninstall removes it later. If creation fails part way, the directories
+// created so far are removed again.
+func (c *Cgroup) Install(res *specs.LinuxResources) error {
+	if _, err := os.Stat(c.makePath("memory")); err == nil {
+		// If cgroup has already been created; it has been setup by caller. Don't
+		// make any changes to configuration, just join when sandbox/gofer starts.
+		log.Debugf("Using pre-created cgroup %q", c.Name)
+		return nil
+	}
+
+	log.Debugf("Creating cgroup %q", c.Name)
+
+	// Mark that cgroup resources are owned by me.
+	c.Own = true
+
+	// The Cleanup object cleans up partially created cgroups when an error occurs.
+	// Errors occurring during cleanup itself are ignored.
+	clean := specutils.MakeCleanup(func() { _ = c.Uninstall() })
+	defer clean.Clean()
+
+	for key, ctrl := range controllers {
+		path := c.makePath(key)
+		if err := os.MkdirAll(path, 0755); err != nil {
+			return err
+		}
+		// Without resources the directories are still created, just not
+		// configured.
+		if res != nil {
+			if err := ctrl.set(res, path); err != nil {
+				return err
+			}
+		}
+	}
+	// Success: disarm the cleanup so the new cgroup is kept.
+	clean.Release()
+	return nil
+}
+
+// Uninstall removes the settings done in Install(). If cgroup path already
+// existed when Install() was called, Uninstall is a noop.
+//
+// The cgroup directory of every controller is removed, retrying for up to 5
+// seconds per controller. A directory that does not exist is not an error.
+// The first directory that cannot be removed aborts the operation.
+func (c *Cgroup) Uninstall() error {
+	if !c.Own {
+		// cgroup is managed by caller, don't touch it.
+		return nil
+	}
+	log.Debugf("Deleting cgroup %q", c.Name)
+	for key := range controllers {
+		path := c.makePath(key)
+		log.Debugf("Removing cgroup controller for key=%q path=%q", key, path)
+
+		// If we try to remove the cgroup too soon after killing the
+		// sandbox we might get EBUSY, so we retry for a few seconds
+		// until it succeeds.
+		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
+		err := backoff.Retry(func() error {
+			err := syscall.Rmdir(path)
+			if os.IsNotExist(err) {
+				return nil
+			}
+			return err
+		}, b)
+		// Release the context's timer now. A deferred cancel inside the loop
+		// would keep every context alive until Uninstall returns.
+		cancel()
+		if err != nil {
+			return fmt.Errorf("removing cgroup path %q: %v", path, err)
+		}
+	}
+	return nil
+}
+
+// Join adds the current process to all controllers. Returns function that
+// restores cgroup to the original state.
+//
+// The returned function is never nil. If joining fails part way, it can still
+// be called to move the process back to the cgroups it was in before.
+func (c *Cgroup) Join() (func(), error) {
+	// First save the current state so it can be restored.
+	undo := func() {}
+	paths, err := LoadPaths("self")
+	if err != nil {
+		return undo, err
+	}
+	var undoPaths []string
+	for ctrlr, path := range paths {
+		// Skip controllers we don't handle. Every handled controller must be
+		// recorded: stopping at the first match would restore only one,
+		// arbitrarily chosen, controller.
+		if _, ok := controllers[ctrlr]; ok {
+			fullPath := filepath.Join(cgroupRoot, ctrlr, path)
+			undoPaths = append(undoPaths, fullPath)
+		}
+	}
+
+	// Replace empty undo with the real thing before changes are made to cgroups.
+	undo = func() {
+		for _, path := range undoPaths {
+			log.Debugf("Restoring cgroup %q", path)
+			// Writing "0" to cgroup.procs moves the writing process.
+			if err := setValue(path, "cgroup.procs", "0"); err != nil {
+				log.Warningf("Error restoring cgroup %q: %v", path, err)
+			}
+		}
+	}
+
+	// Now join the cgroups.
+	for key := range controllers {
+		path := c.makePath(key)
+		log.Debugf("Joining cgroup %q", path)
+		if err := setValue(path, "cgroup.procs", "0"); err != nil {
+			return undo, err
+		}
+	}
+	return undo, nil
+}
+
+// CPUQuota returns the CFS CPU quota of the cgroup as a number of CPUs, i.e.
+// 'cpu.cfs_quota_us' divided by 'cpu.cfs_period_us'. It returns -1 and a nil
+// error when either value is not positive, e.g. when no quota is set.
+func (c *Cgroup) CPUQuota() (float64, error) {
+	path := c.makePath("cpu")
+	quota, err := getInt(path, "cpu.cfs_quota_us")
+	if err != nil {
+		return -1, err
+	}
+	period, err := getInt(path, "cpu.cfs_period_us")
+	if err != nil {
+		return -1, err
+	}
+	if quota <= 0 || period <= 0 {
+		// Not an error: there is simply no usable quota.
+		return -1, nil
+	}
+	return float64(quota) / float64(period), nil
+}
+
+// NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'.
+func (c *Cgroup) NumCPU() (int, error) {
+	raw, err := getValue(c.makePath("cpuset"), "cpuset.cpus")
+	if err != nil {
+		return 0, err
+	}
+	cpus := strings.TrimSpace(raw)
+	return countCpuset(cpus)
+}
+
+// MemoryLimit returns the memory limit read from 'memory/memory.limit_in_bytes'.
+func (c *Cgroup) MemoryLimit() (uint64, error) {
+	raw, err := getValue(c.makePath("memory"), "memory.limit_in_bytes")
+	if err != nil {
+		return 0, err
+	}
+	limit := strings.TrimSpace(raw)
+	return strconv.ParseUint(limit, 10, 64)
+}
+
+// makePath returns the directory of this cgroup inside the hierarchy of the
+// given controller. When a parent path is recorded for the controller, the
+// cgroup name is joined to it first.
+func (c *Cgroup) makePath(controllerName string) string {
+	rel := c.Name
+	parent, found := c.Parents[controllerName]
+	if found {
+		rel = filepath.Join(parent, c.Name)
+	}
+	return filepath.Join(cgroupRoot, controllerName, rel)
+}
+
+// controller applies the resource configuration of one cgroup controller.
+type controller interface {
+	// set writes the settings relevant to this controller from the given
+	// resources into the cgroup directory given as second argument.
+	set(*specs.LinuxResources, string) error
+}
+
+// noop is a controller with nothing to configure.
+type noop struct{}
+
+// set does nothing and always succeeds.
+func (*noop) set(*specs.LinuxResources, string) error {
+	return nil
+}
+
+// memory configures the "memory" controller.
+type memory struct{}
+
+// set writes the memory limits from spec.Memory into the cgroup at 'path'.
+// Values that are unset (nil) or zero are skipped. The OOM killer is only
+// touched when the spec explicitly disables it.
+func (*memory) set(spec *specs.LinuxResources, path string) error {
+	mem := spec.Memory
+	if mem == nil {
+		return nil
+	}
+
+	// Signed limits, written in this order.
+	limits := []struct {
+		file string
+		val  *int64
+	}{
+		{"memory.limit_in_bytes", mem.Limit},
+		{"memory.soft_limit_in_bytes", mem.Reservation},
+		{"memory.memsw.limit_in_bytes", mem.Swap},
+		{"memory.kmem.limit_in_bytes", mem.Kernel},
+		{"memory.kmem.tcp.limit_in_bytes", mem.KernelTCP},
+	}
+	for _, lim := range limits {
+		if err := setOptionalValueInt(path, lim.file, lim.val); err != nil {
+			return err
+		}
+	}
+	if err := setOptionalValueUint(path, "memory.swappiness", mem.Swappiness); err != nil {
+		return err
+	}
+
+	if disable := mem.DisableOOMKiller; disable != nil && *disable {
+		return setValue(path, "memory.oom_control", "1")
+	}
+	return nil
+}
+
+// cpu configures the "cpu" controller.
+type cpu struct{}
+
+// set writes the CPU shares, CFS quota and CFS period from spec.CPU into the
+// cgroup at 'path'. Values that are unset (nil) or zero are skipped.
+func (*cpu) set(spec *specs.LinuxResources, path string) error {
+	if spec.CPU == nil {
+		return nil
+	}
+	if err := setOptionalValueUint(path, "cpu.shares", spec.CPU.Shares); err != nil {
+		return err
+	}
+	if err := setOptionalValueInt(path, "cpu.cfs_quota_us", spec.CPU.Quota); err != nil {
+		return err
+	}
+	return setOptionalValueUint(path, "cpu.cfs_period_us", spec.CPU.Period)
+}
+
+// cpuSet configures the "cpuset" controller.
+type cpuSet struct{}
+
+// set writes 'cpuset.cpus' and 'cpuset.mems' into the cgroup at 'path'.
+//
+// cpuset.cpus and mems are required fields, but are not set on a new cgroup.
+// If not set in the spec, get it from one of the ancestors cgroup.
+func (*cpuSet) set(spec *specs.LinuxResources, path string) error {
+	var cpus, mems string
+	if spec.CPU != nil {
+		cpus, mems = spec.CPU.Cpus, spec.CPU.Mems
+	}
+
+	if cpus != "" {
+		if err := setValue(path, "cpuset.cpus", cpus); err != nil {
+			return err
+		}
+	} else if _, err := fillFromAncestor(filepath.Join(path, "cpuset.cpus")); err != nil {
+		return err
+	}
+
+	if mems != "" {
+		return setValue(path, "cpuset.mems", mems)
+	}
+	_, err := fillFromAncestor(filepath.Join(path, "cpuset.mems"))
+	return err
+}
+
+// blockIO configures the "blkio" controller.
+type blockIO struct{}
+
+// set writes the block IO weights, the per-device weights and the per-device
+// throttle limits from spec.BlockIO into the cgroup at 'path'. Weights that
+// are not set in the spec are skipped.
+func (*blockIO) set(spec *specs.LinuxResources, path string) error {
+	if spec.BlockIO == nil {
+		return nil
+	}
+
+	if err := setOptionalValueUint16(path, "blkio.weight", spec.BlockIO.Weight); err != nil {
+		return err
+	}
+	if err := setOptionalValueUint16(path, "blkio.leaf_weight", spec.BlockIO.LeafWeight); err != nil {
+		return err
+	}
+
+	for _, dev := range spec.BlockIO.WeightDevice {
+		// Weight and LeafWeight are optional pointers in the OCI spec. They
+		// must be dereferenced (formatting the pointer itself would print an
+		// address) and skipped when not set.
+		if dev.Weight != nil {
+			val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.Weight)
+			if err := setValue(path, "blkio.weight_device", val); err != nil {
+				return err
+			}
+		}
+		if dev.LeafWeight != nil {
+			val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.LeafWeight)
+			if err := setValue(path, "blkio.leaf_weight_device", val); err != nil {
+				return err
+			}
+		}
+	}
+	if err := setThrottle(path, "blkio.throttle.read_bps_device", spec.BlockIO.ThrottleReadBpsDevice); err != nil {
+		return err
+	}
+	if err := setThrottle(path, "blkio.throttle.write_bps_device", spec.BlockIO.ThrottleWriteBpsDevice); err != nil {
+		return err
+	}
+	if err := setThrottle(path, "blkio.throttle.read_iops_device", spec.BlockIO.ThrottleReadIOPSDevice); err != nil {
+		return err
+	}
+	return setThrottle(path, "blkio.throttle.write_iops_device", spec.BlockIO.ThrottleWriteIOPSDevice)
+}
+
+// setThrottle writes one "major:minor rate" entry per device in 'devs' to the
+// cgroup file 'name' under 'path', stopping at the first write error.
+func setThrottle(path, name string, devs []specs.LinuxThrottleDevice) error {
+	for _, dev := range devs {
+		val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.Rate)
+		if err := setValue(path, name, val); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// networkClass configures the "net_cls" controller.
+type networkClass struct{}
+
+// set writes spec.Network.ClassID to 'net_cls.classid' in the cgroup at
+// 'path'. Nothing is written when the class ID is unset (nil) or zero.
+func (*networkClass) set(spec *specs.LinuxResources, path string) error {
+	if spec.Network == nil {
+		return nil
+	}
+	return setOptionalValueUint32(path, "net_cls.classid", spec.Network.ClassID)
+}
+
+// networkPrio configures the "net_prio" controller.
+type networkPrio struct{}
+
+// set writes one "name priority" entry per element of
+// spec.Network.Priorities to 'net_prio.ifpriomap' in the cgroup at 'path',
+// stopping at the first write error.
+func (*networkPrio) set(spec *specs.LinuxResources, path string) error {
+	if spec.Network == nil {
+		return nil
+	}
+	for _, prio := range spec.Network.Priorities {
+		val := fmt.Sprintf("%s %d", prio.Name, prio.Priority)
+		if err := setValue(path, "net_prio.ifpriomap", val); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// pids configures the "pids" controller.
+type pids struct{}
+
+// set writes spec.Pids.Limit to 'pids.max' in the cgroup at 'path'.
+//
+// A limit that is zero or negative is treated as "no limit requested" and
+// nothing is written. Writing "0" would forbid creating any task in the
+// cgroup, and a negative number is not a valid value for 'pids.max'.
+func (*pids) set(spec *specs.LinuxResources, path string) error {
+	if spec.Pids == nil || spec.Pids.Limit <= 0 {
+		return nil
+	}
+	val := strconv.FormatInt(spec.Pids.Limit, 10)
+	return setValue(path, "pids.max", val)
+}
diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go
new file mode 100644
index 000000000..548c80e9a
--- /dev/null
+++ b/runsc/cgroup/cgroup_test.go
@@ -0,0 +1,67 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cgroup
+
+import (
+	"testing"
+)
+
+// TestUninstallEnoent checks that Uninstall succeeds for an owned cgroup whose
+// directories do not exist.
+func TestUninstallEnoent(t *testing.T) {
+	c := Cgroup{
+		// set a non-existent name
+		Name: "runsc-test-uninstall-656e6f656e740a",
+		// Own must be set, otherwise Uninstall returns without doing anything.
+		Own: true,
+	}
+	if err := c.Uninstall(); err != nil {
+		t.Errorf("Uninstall() failed: %v", err)
+	}
+}
+
+// TestCountCpuset checks countCpuset against valid cpuset strings (single
+// CPUs, ranges and combinations of both) and against malformed ones that must
+// be rejected.
+func TestCountCpuset(t *testing.T) {
+	for _, tc := range []struct {
+		str   string
+		want  int
+		error bool
+	}{
+		{str: "0", want: 1},
+		{str: "0,1,2,8,9,10", want: 6},
+		{str: "0-1", want: 2},
+		{str: "0-7", want: 8},
+		{str: "0-7,16,32-39,64,65", want: 19},
+		// Malformed input: non-numeric values and incomplete ranges.
+		{str: "a", error: true},
+		{str: "5-a", error: true},
+		{str: "a-5", error: true},
+		{str: "-10", error: true},
+		{str: "15-", error: true},
+		{str: "-", error: true},
+		{str: "--", error: true},
+	} {
+		t.Run(tc.str, func(t *testing.T) {
+			got, err := countCpuset(tc.str)
+			if tc.error {
+				if err == nil {
+					t.Errorf("countCpuset(%q) should have failed", tc.str)
+				}
+			} else {
+				if err != nil {
+					t.Errorf("countCpuset(%q) failed: %v", tc.str, err)
+				}
+				if tc.want != got {
+					t.Errorf("countCpuset(%q) want: %d, got: %d", tc.str, tc.want, got)
+				}
+			}
+		})
+	}
+}
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
new file mode 100644
index 000000000..af3538ef0
--- /dev/null
+++ b/runsc/cmd/BUILD
@@ -0,0 +1,95 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "cmd",
+    srcs = [
+        "boot.go",
+        "capability.go",
+        "checkpoint.go",
+        "chroot.go",
+        "cmd.go",
+        "create.go",
+        "debug.go",
+        "delete.go",
+        "do.go",
+        "error.go",
+        "events.go",
+        "exec.go",
+        "gofer.go",
+        "help.go",
+        "install.go",
+        "kill.go",
+        "list.go",
+        "path.go",
+        "pause.go",
+        "ps.go",
+        "restore.go",
+        "resume.go",
+        "run.go",
+        "spec.go",
+        "start.go",
+        "state.go",
+        "statefile.go",
+        "syscalls.go",
+        "wait.go",
+    ],
+    visibility = [
+        "//runsc:__subpackages__",
+    ],
+    deps = [
+        "//pkg/log",
+        "//pkg/p9",
+        "//pkg/sentry/control",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/platform",
+        "//pkg/state",
+        "//pkg/state/statefile",
+        "//pkg/sync",
+        "//pkg/unet",
+        "//pkg/urpc",
+        "//runsc/boot",
+        "//runsc/console",
+        "//runsc/container",
+        "//runsc/flag",
+        "//runsc/fsgofer",
+        "//runsc/fsgofer/filter",
+        "//runsc/specutils",
+        "@com_github_google_subcommands//:go_default_library",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@com_github_syndtr_gocapability//capability:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+go_test(
+    name = "cmd_test",
+    size = "small",
+    srcs = [
+        "capability_test.go",
+        "delete_test.go",
+        "exec_test.go",
+        "gofer_test.go",
+    ],
+    data = [
+        "//runsc",
+    ],
+    library = ":cmd",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/log",
+        "//pkg/sentry/control",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/test/testutil",
+        "//pkg/urpc",
+        "//runsc/boot",
+        "//runsc/container",
+        "//runsc/specutils",
+        "@com_github_google_go-cmp//cmp:go_default_library",
+        "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@com_github_syndtr_gocapability//capability:go_default_library",
+    ],
+)
diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
new file mode 100644
index 000000000..4c2ac6ff0
--- /dev/null
+++ b/runsc/cmd/boot.go
@@ -0,0 +1,290 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"os"
+	"runtime/debug"
+	"strings"
+	"syscall"
+
+	"github.com/google/subcommands"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/flag"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+// Boot implements subcommands.Command for the "boot" command which starts a
+// new sandbox. It should not be called directly.
+type Boot struct {
+	// bundleDir is the directory containing the OCI spec.
+	bundleDir string
+
+	// specFD is the file descriptor that the spec will be read from.
+	specFD int
+
+	// controllerFD is the file descriptor of a stream socket for the
+	// control server that is donated to this process.
+	controllerFD int
+
+	// deviceFD is the file descriptor for the platform device file.
+	deviceFD int
+
+	// ioFDs is the list of FDs used to connect to FS gofers.
+	ioFDs intFlags
+
+	// stdioFDs are the fds for stdin, stdout, and stderr. They must be
+	// provided in that order.
+	stdioFDs intFlags
+
+	// console is set to true if the sandbox should allow terminal ioctl(2)
+	// syscalls.
+	console bool
+
+	// applyCaps determines if capabilities defined in the spec should be applied
+	// to the process.
+	applyCaps bool
+
+	// setUpRoot is set to true if the sandbox is started in an empty root.
+	setUpRoot bool
+
+	// cpuNum number of CPUs to create inside the sandbox.
+	cpuNum int
+
+	// totalMem sets the initial amount of total memory to report back to the
+	// container.
+	totalMem uint64
+
+	// userLogFD is the file descriptor to write user logs to.
+	userLogFD int
+
+	// startSyncFD is the file descriptor to synchronize runsc and sandbox.
+	startSyncFD int
+
+	// mountsFD is the file descriptor to read list of mounts after they have
+	// been resolved (direct paths, no symlinks). They are resolved outside the
+	// sandbox (e.g. gofer) and sent through this FD.
+	mountsFD int
+
+	// pidns is set if the sandbox is in its own pid namespace.
+	pidns bool
+
+	// attached is set to true to kill the sandbox process when the parent process
+	// terminates. This flag is set when the command execve's itself because
+	// parent death signal doesn't propagate through execve when uid/gid changes.
+	attached bool
+}
+
+// Name implements subcommands.Command.Name.
+func (*Boot) Name() string {
+	return "boot"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Boot) Synopsis() string {
+	return "launch a sandbox process (internal use only)"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Boot) Usage() string {
+	return `boot [flags] <container id>`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (b *Boot) SetFlags(f *flag.FlagSet) {
+	f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory")
+	f.IntVar(&b.specFD, "spec-fd", -1, "required fd with the container spec")
+	f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process")
+	f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file")
+	f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec")
+	f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order")
+	f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls")
+	f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process")
+	f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process")
+	f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace")
+	f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox")
+	f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container")
+	f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.")
+	f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD to used to synchronize sandbox startup")
+	f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to read list of mounts after they have been resolved (direct paths, no symlinks).")
+	f.BoolVar(&b.attached, "attached", false, "if attached is true, kills the sandbox process when the parent process terminates")
+}
+
+// Execute implements subcommands.Command.Execute.  It starts a sandbox in a
+// waiting state.
+func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if b.specFD == -1 || b.controllerFD == -1 || b.startSyncFD == -1 || f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	// Ensure that if there is a panic, all goroutine stacks are printed.
+	debug.SetTraceback("all")
+
+	conf := args[0].(*boot.Config)
+
+	if b.attached {
+		// Ensure this process is killed after parent process terminates when
+		// attached mode is enabled. In the unfortunate event that the parent
+		// terminates before this point, this process leaks.
+		if err := unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0); err != nil {
+			Fatalf("error setting parent death signal: %v", err)
+		}
+	}
+
+	if b.setUpRoot {
+		if err := setUpChroot(b.pidns); err != nil {
+			Fatalf("error setting up chroot: %v", err)
+		}
+
+		if !b.applyCaps && !conf.Rootless {
+			// Remove --setup-root arg to call myself. It has already been done.
+			args := prepareArgs(b.attached, "setup-root")
+
+			// Note that we've already read the spec from the spec FD, and
+			// we will read it again after the exec call. This works
+			// because the ReadSpecFromFile function seeks to the beginning
+			// of the file before reading.
+			if err := callSelfAsNobody(args); err != nil {
+				Fatalf("%v", err)
+			}
+			panic("callSelfAsNobody must never return success")
+		}
+	}
+
+	// Get the spec from the specFD.
+	specFile := os.NewFile(uintptr(b.specFD), "spec file")
+	defer specFile.Close()
+	spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile)
+	if err != nil {
+		Fatalf("reading spec: %v", err)
+	}
+	specutils.LogSpec(spec)
+
+	if b.applyCaps {
+		caps := spec.Process.Capabilities
+		if caps == nil {
+			caps = &specs.LinuxCapabilities{}
+		}
+
+		gPlatform, err := platform.Lookup(conf.Platform)
+		if err != nil {
+			Fatalf("loading platform: %v", err)
+		}
+		if gPlatform.Requirements().RequiresCapSysPtrace {
+			// Ptrace platform requires extra capabilities.
+			const c = "CAP_SYS_PTRACE"
+			caps.Bounding = append(caps.Bounding, c)
+			caps.Effective = append(caps.Effective, c)
+			caps.Permitted = append(caps.Permitted, c)
+		}
+
+		// Remove --apply-caps and --setup-root arg to call myself. Both have
+		// already been done.
+		args := prepareArgs(b.attached, "setup-root", "apply-caps")
+
+		// Note that we've already read the spec from the spec FD, and
+		// we will read it again after the exec call. This works
+		// because the ReadSpecFromFile function seeks to the beginning
+		// of the file before reading.
+		if err := setCapsAndCallSelf(args, caps); err != nil {
+			Fatalf("%v", err)
+		}
+		panic("setCapsAndCallSelf must never return success")
+	}
+
+	// Read resolved mount list and replace the original one from the spec.
+	mountsFile := os.NewFile(uintptr(b.mountsFD), "mounts file")
+	cleanMounts, err := specutils.ReadMounts(mountsFile)
+	if err != nil {
+		mountsFile.Close()
+		Fatalf("Error reading mounts file: %v", err)
+	}
+	mountsFile.Close()
+	spec.Mounts = cleanMounts
+
+	// Create the loader.
+	bootArgs := boot.Args{
+		ID:           f.Arg(0),
+		Spec:         spec,
+		Conf:         conf,
+		ControllerFD: b.controllerFD,
+		Device:       os.NewFile(uintptr(b.deviceFD), "platform device"),
+		GoferFDs:     b.ioFDs.GetArray(),
+		StdioFDs:     b.stdioFDs.GetArray(),
+		Console:      b.console,
+		NumCPU:       b.cpuNum,
+		TotalMem:     b.totalMem,
+		UserLogFD:    b.userLogFD,
+	}
+	l, err := boot.New(bootArgs)
+	if err != nil {
+		Fatalf("creating loader: %v", err)
+	}
+
+	// Fatalf exits the process and doesn't run defers.
+	// 'l' must be destroyed explicitly after this point!
+
+	// Notify the parent process the sandbox has booted (and that the controller
+	// is up).
+	startSyncFile := os.NewFile(uintptr(b.startSyncFD), "start-sync file")
+	buf := make([]byte, 1)
+	if w, err := startSyncFile.Write(buf); err != nil || w != 1 {
+		l.Destroy()
+		Fatalf("unable to write into the start-sync descriptor: %v", err)
+	}
+	// Closes startSyncFile because 'l.Run()' only returns when the sandbox exits.
+	startSyncFile.Close()
+
+	// Wait for the start signal from runsc.
+	l.WaitForStartSignal()
+
+	// Run the application and wait for it to finish.
+	if err := l.Run(); err != nil {
+		l.Destroy()
+		Fatalf("running sandbox: %v", err)
+	}
+
+	ws := l.WaitExit()
+	log.Infof("application exiting with %+v", ws)
+	waitStatus := args[1].(*syscall.WaitStatus)
+	*waitStatus = syscall.WaitStatus(ws.Status())
+	l.Destroy()
+	return subcommands.ExitSuccess
+}
+
+// prepareArgs copies os.Args, dropping flags whose name (not a mere substring)
+// is in 'exclude', and adds "--attached" after "boot" when attached is set.
+func prepareArgs(attached bool, exclude ...string) []string {
+	var args []string
+	for _, arg := range os.Args {
+		for _, excl := range exclude {
+			if strings.HasPrefix(arg, "-") && strings.SplitN(strings.TrimLeft(arg, "-"), "=", 2)[0] == excl {
+				goto skip // Matches "-name", "--name" and "--name=value".
+			}
+		}
+		args = append(args, arg)
+		if attached && arg == "boot" { // New process must also die with its parent.
+			args = append(args, "--attached")
+		}
+	skip:
+	}
+	return args
+}
diff --git a/runsc/cmd/capability.go b/runsc/cmd/capability.go
new file mode 100644
index 000000000..abfbb7cfc
--- /dev/null
+++ b/runsc/cmd/capability.go
@@ -0,0 +1,157 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"fmt"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/syndtr/gocapability/capability"
+	"gvisor.dev/gvisor/pkg/log"
+)
+
+var allCapTypes = []capability.CapType{
+	capability.BOUNDS,
+	capability.EFFECTIVE,
+	capability.PERMITTED,
+	capability.INHERITABLE,
+	capability.AMBIENT,
+}
+
+// applyCaps applies the capabilities in the spec to the current thread.
+//
+// Note that it must be called with current thread locked.
+func applyCaps(caps *specs.LinuxCapabilities) error {
+	// Load current capabilities to trim the ones not permitted.
+	curCaps, err := capability.NewPid2(0)
+	if err != nil {
+		return err
+	}
+	if err := curCaps.Load(); err != nil {
+		return err
+	}
+
+	// Create an empty capability set to populate.
+	newCaps, err := capability.NewPid2(0)
+	if err != nil {
+		return err
+	}
+
+	for _, c := range allCapTypes {
+		if !newCaps.Empty(c) {
+			panic("unloaded capabilities must be empty")
+		}
+		set, err := trimCaps(getCaps(c, caps), curCaps)
+		if err != nil {
+			return err
+		}
+		newCaps.Set(c, set...)
+	}
+
+	if err := newCaps.Apply(capability.CAPS | capability.BOUNDS | capability.AMBS); err != nil {
+		return err
+	}
+	log.Infof("Capabilities applied: %+v", newCaps)
+	return nil
+}
+
+func getCaps(which capability.CapType, caps *specs.LinuxCapabilities) []string {
+	switch which {
+	case capability.BOUNDS:
+		return caps.Bounding
+	case capability.EFFECTIVE:
+		return caps.Effective
+	case capability.PERMITTED:
+		return caps.Permitted
+	case capability.INHERITABLE:
+		return caps.Inheritable
+	case capability.AMBIENT:
+		return caps.Ambient
+	}
+	panic(fmt.Sprint("invalid capability type:", which))
+}
+
+func trimCaps(names []string, setter capability.Capabilities) ([]capability.Cap, error) {
+	wantedCaps, err := capsFromNames(names)
+	if err != nil {
+		return nil, err
+	}
+
+	// Trim down capabilities that aren't possible to acquire.
+	var caps []capability.Cap
+	for _, c := range wantedCaps {
+		// Capability rules are more complicated than this, but this catches most
+		// problems with tests running with non-privileged user.
+		if setter.Get(capability.PERMITTED, c) {
+			caps = append(caps, c)
+		} else {
+			log.Warningf("Capability %q is not permitted, dropping it.", c)
+		}
+	}
+	return caps, nil
+}
+
+func capsFromNames(names []string) ([]capability.Cap, error) {
+	var caps []capability.Cap
+	for _, name := range names {
+		cap, ok := capFromName[name]
+		if !ok {
+			return nil, fmt.Errorf("invalid capability %q", name)
+		}
+		caps = append(caps, cap)
+	}
+	return caps, nil
+}
+
+var capFromName = map[string]capability.Cap{
+	"CAP_CHOWN":            capability.CAP_CHOWN,
+	"CAP_DAC_OVERRIDE":     capability.CAP_DAC_OVERRIDE,
+	"CAP_DAC_READ_SEARCH":  capability.CAP_DAC_READ_SEARCH,
+	"CAP_FOWNER":           capability.CAP_FOWNER,
+	"CAP_FSETID":           capability.CAP_FSETID,
+	"CAP_KILL":             capability.CAP_KILL,
+	"CAP_SETGID":           capability.CAP_SETGID,
+	"CAP_SETUID":           capability.CAP_SETUID,
+	"CAP_SETPCAP":          capability.CAP_SETPCAP,
+	"CAP_LINUX_IMMUTABLE":  capability.CAP_LINUX_IMMUTABLE,
+	"CAP_NET_BIND_SERVICE": capability.CAP_NET_BIND_SERVICE,
+	"CAP_NET_BROADCAST":    capability.CAP_NET_BROADCAST,
+	"CAP_NET_ADMIN":        capability.CAP_NET_ADMIN,
+	"CAP_NET_RAW":          capability.CAP_NET_RAW,
+	"CAP_IPC_LOCK":         capability.CAP_IPC_LOCK,
+	"CAP_IPC_OWNER":        capability.CAP_IPC_OWNER,
+	"CAP_SYS_MODULE":       capability.CAP_SYS_MODULE,
+	"CAP_SYS_RAWIO":        capability.CAP_SYS_RAWIO,
+	"CAP_SYS_CHROOT":       capability.CAP_SYS_CHROOT,
+	"CAP_SYS_PTRACE":       capability.CAP_SYS_PTRACE,
+	"CAP_SYS_PACCT":        capability.CAP_SYS_PACCT,
+	"CAP_SYS_ADMIN":        capability.CAP_SYS_ADMIN,
+	"CAP_SYS_BOOT":         capability.CAP_SYS_BOOT,
+	"CAP_SYS_NICE":         capability.CAP_SYS_NICE,
+	"CAP_SYS_RESOURCE":     capability.CAP_SYS_RESOURCE,
+	"CAP_SYS_TIME":         capability.CAP_SYS_TIME,
+	"CAP_SYS_TTY_CONFIG":   capability.CAP_SYS_TTY_CONFIG,
+	"CAP_MKNOD":            capability.CAP_MKNOD,
+	"CAP_LEASE":            capability.CAP_LEASE,
+	"CAP_AUDIT_WRITE":      capability.CAP_AUDIT_WRITE,
+	"CAP_AUDIT_CONTROL":    capability.CAP_AUDIT_CONTROL,
+	"CAP_SETFCAP":          capability.CAP_SETFCAP,
+	"CAP_MAC_OVERRIDE":     capability.CAP_MAC_OVERRIDE,
+	"CAP_MAC_ADMIN":        capability.CAP_MAC_ADMIN,
+	"CAP_SYSLOG":           capability.CAP_SYSLOG,
+	"CAP_WAKE_ALARM":       capability.CAP_WAKE_ALARM,
+	"CAP_BLOCK_SUSPEND":    capability.CAP_BLOCK_SUSPEND,
+	"CAP_AUDIT_READ":       capability.CAP_AUDIT_READ,
+}
diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go
new file mode 100644
index 000000000..a84067112
--- /dev/null
+++ b/runsc/cmd/capability_test.go
@@ -0,0 +1,127 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"flag"
+	"fmt"
+	"os"
+	"testing"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/syndtr/gocapability/capability"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/test/testutil"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+func init() {
+	log.SetLevel(log.Debug)
+	if err := testutil.ConfigureExePath(); err != nil {
+		panic(err.Error())
+	}
+}
+
+func checkProcessCaps(pid int, wantCaps *specs.LinuxCapabilities) error {
+	curCaps, err := capability.NewPid2(pid)
+	if err != nil {
+		return fmt.Errorf("capability.NewPid2(%d) failed: %v", pid, err)
+	}
+	if err := curCaps.Load(); err != nil {
+		return fmt.Errorf("unable to load capabilities: %v", err)
+	}
+	fmt.Printf("Capabilities (PID: %d): %v\n", pid, curCaps)
+
+	for _, c := range allCapTypes {
+		if err := checkCaps(c, curCaps, wantCaps); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func checkCaps(which capability.CapType, curCaps capability.Capabilities, wantCaps *specs.LinuxCapabilities) error {
+	wantNames := getCaps(which, wantCaps)
+	for name, c := range capFromName {
+		want := specutils.ContainsStr(wantNames, name)
+		got := curCaps.Get(which, c)
+		if want != got {
+			if want {
+				return fmt.Errorf("capability %v:%s should be set", which, name)
+			}
+			return fmt.Errorf("capability %v:%s should NOT be set", which, name)
+		}
+	}
+	return nil
+}
+
+func TestCapabilities(t *testing.T) {
+	stop := testutil.StartReaper()
+	defer stop()
+
+	spec := testutil.NewSpecWithArgs("/bin/sleep", "10000")
+	caps := []string{
+		"CAP_CHOWN",
+		"CAP_SYS_PTRACE", // ptrace is added due to the platform choice.
+	}
+	spec.Process.Capabilities = &specs.LinuxCapabilities{
+		Permitted:   caps,
+		Bounding:    caps,
+		Effective:   caps,
+		Inheritable: caps,
+	}
+
+	conf := testutil.TestConfig(t)
+
+	// Use --network=host to make sandbox use spec's capabilities.
+	conf.Network = boot.NetworkHost
+
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+	if err != nil {
+		t.Fatalf("error setting up container: %v", err)
+	}
+	defer cleanup()
+
+	// Create and start the container.
+	args := container.Args{
+		ID:        testutil.RandomContainerID(),
+		Spec:      spec,
+		BundleDir: bundleDir,
+	}
+	c, err := container.New(conf, args)
+	if err != nil {
+		t.Fatalf("error creating container: %v", err)
+	}
+	defer c.Destroy()
+	if err := c.Start(conf); err != nil {
+		t.Fatalf("error starting container: %v", err)
+	}
+
+	// Check that sandbox and gofer have the proper capabilities.
+	if err := checkProcessCaps(c.Sandbox.Pid, spec.Process.Capabilities); err != nil {
+		t.Error(err)
+	}
+	if err := checkProcessCaps(c.GoferPid, goferCaps); err != nil {
+		t.Error(err)
+	}
+}
+
+func TestMain(m *testing.M) {
+	flag.Parse()
+	specutils.MaybeRunAsRoot()
+	os.Exit(m.Run())
+}
diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go
new file mode 100644
index 000000000..8a29e521e
--- /dev/null
+++ b/runsc/cmd/checkpoint.go
@@ -0,0 +1,155 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"syscall"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+// File containing the container's saved image/state within the given image-path's directory.
+const checkpointFileName = "checkpoint.img"
+
+// Checkpoint implements subcommands.Command for the "checkpoint" command.
+type Checkpoint struct {
+	imagePath    string
+	leaveRunning bool
+}
+
+// Name implements subcommands.Command.Name.
+func (*Checkpoint) Name() string {
+	return "checkpoint"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Checkpoint) Synopsis() string {
+	return "checkpoint current state of container (experimental)"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Checkpoint) Usage() string {
+	return `checkpoint [flags] <container id> - save current state of container.
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (c *Checkpoint) SetFlags(f *flag.FlagSet) {
+	f.StringVar(&c.imagePath, "image-path", "", "directory path to saved container image")
+	f.BoolVar(&c.leaveRunning, "leave-running", false, "restart the container after checkpointing")
+
+	// Unimplemented flags necessary for compatibility with docker.
+	var wp string
+	f.StringVar(&wp, "work-path", "", "ignored")
+}
+
+// Execute implements subcommands.Command.Execute. It checkpoints the container
+// and, if --leave-running is set, restores it into a new container with the same ID.
+func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	id := f.Arg(0)
+	conf := args[0].(*boot.Config)
+	waitStatus := args[1].(*syscall.WaitStatus)
+
+	cont, err := container.Load(conf.RootDir, id)
+	if err != nil {
+		Fatalf("loading container: %v", err)
+	}
+
+	if c.imagePath == "" {
+		Fatalf("image-path flag must be provided")
+	}
+
+	if err := os.MkdirAll(c.imagePath, 0755); err != nil {
+		Fatalf("making directories at path provided: %v", err)
+	}
+
+	fullImagePath := filepath.Join(c.imagePath, checkpointFileName)
+	// Create the image file and open for writing.
+	file, err := os.OpenFile(fullImagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644)
+	if err != nil {
+		Fatalf("os.OpenFile(%q) failed: %v", fullImagePath, err)
+	}
+	defer file.Close()
+
+	if err := cont.Checkpoint(file); err != nil {
+		Fatalf("checkpoint failed: %v", err)
+	}
+
+	if !c.leaveRunning {
+		return subcommands.ExitSuccess
+	}
+
+	// TODO(b/110843694): Make it possible to restore into same container.
+	// For now, we can fake it by destroying the container and making a
+	// new container with the same ID. This hack does not work with docker
+	// which uses the container pid to ensure that the restore-container is
+	// actually the same as the checkpoint-container. By restoring into
+	// the same container, we will solve the docker incompatibility.
+
+	// Restore into new container with same ID.
+	bundleDir := cont.BundleDir
+	if bundleDir == "" {
+		Fatalf("setting bundleDir")
+	}
+
+	spec, err := specutils.ReadSpec(bundleDir)
+	if err != nil {
+		Fatalf("reading spec: %v", err)
+	}
+	specutils.LogSpec(spec)
+
+	if cont.ConsoleSocket != "" {
+		log.Warningf("ignoring console socket since it cannot be restored")
+	}
+
+	if err := cont.Destroy(); err != nil {
+		Fatalf("destroying container: %v", err)
+	}
+
+	contArgs := container.Args{
+		ID:        id,
+		Spec:      spec,
+		BundleDir: bundleDir,
+	}
+	cont, err = container.New(conf, contArgs)
+	if err != nil {
+		Fatalf("restoring container: %v", err)
+	}
+	defer cont.Destroy()
+
+	if err := cont.Restore(spec, conf, fullImagePath); err != nil {
+		Fatalf("starting container: %v", err)
+	}
+
+	ws, err := cont.Wait()
+	if err != nil {
+		Fatalf("waiting for container: %v", err)
+	}
+	*waitStatus = ws
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/chroot.go b/runsc/cmd/chroot.go
new file mode 100644
index 000000000..189244765
--- /dev/null
+++ b/runsc/cmd/chroot.go
@@ -0,0 +1,97 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+// mountInChroot creates the destination mount point in the given chroot and
+// mounts the source.
+func mountInChroot(chroot, src, dst, typ string, flags uint32) error {
+	chrootDst := filepath.Join(chroot, dst)
+	log.Infof("Mounting %q at %q", src, chrootDst)
+
+	if err := specutils.Mount(src, chrootDst, typ, flags); err != nil {
+		return fmt.Errorf("error mounting %q at %q: %v", src, chrootDst, err)
+	}
+	return nil
+}
+
+func pivotRoot(root string) error {
+	if err := os.Chdir(root); err != nil {
+		return fmt.Errorf("error changing working directory: %v", err)
+	}
+	// pivot_root(new_root, put_old) moves the root filesystem (old_root)
+	// of the calling process to the directory put_old and makes new_root
+	// the new root filesystem of the calling process.
+	//
+	// pivot_root(".", ".") makes a mount of the working directory the new
+	// root filesystem, so it will be moved in "/" and then the old_root
+	// will be moved to "/" too. The parent mount of the old_root will be
+	// new_root, so after umounting the old_root, we will see only
+	// the new_root in "/".
+	if err := syscall.PivotRoot(".", "."); err != nil {
+		return fmt.Errorf("pivot_root failed, make sure that the root mount has a parent: %v", err)
+	}
+
+	if err := syscall.Unmount(".", syscall.MNT_DETACH); err != nil {
+		return fmt.Errorf("error umounting the old root file system: %v", err)
+	}
+	return nil
+}
+
+// setUpChroot builds an empty tmpfs root that contains only a read-only /proc,
+// remounts that root read-only and pivots the process into it.
+func setUpChroot(pidns bool) error {
+	// We are a new mount namespace, so we can use /tmp as a directory to
+	// construct a new root.
+	chroot := os.TempDir()
+
+	log.Infof("Setting up sandbox chroot in %q", chroot)
+
+	// Convert all shared mounts into slave to be sure that nothing will be
+	// propagated outside of our namespace.
+	if err := syscall.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil {
+		return fmt.Errorf("error converting mounts: %v", err)
+	}
+
+	if err := syscall.Mount("runsc-root", chroot, "tmpfs", syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_NOEXEC, ""); err != nil {
+		return fmt.Errorf("error mounting tmpfs in chroot: %v", err)
+	}
+
+	if pidns {
+		flags := uint32(syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_NOEXEC | syscall.MS_RDONLY)
+		if err := mountInChroot(chroot, "proc", "/proc", "proc", flags); err != nil {
+			return fmt.Errorf("error mounting proc in chroot: %v", err)
+		}
+	} else {
+		if err := mountInChroot(chroot, "/proc", "/proc", "bind", syscall.MS_BIND|syscall.MS_RDONLY|syscall.MS_REC); err != nil {
+			return fmt.Errorf("error mounting proc in chroot: %v", err)
+		}
+	}
+
+	if err := syscall.Mount("", chroot, "", syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_BIND, ""); err != nil {
+		return fmt.Errorf("error remounting chroot in read-only: %v", err)
+	}
+
+	return pivotRoot(chroot)
+}
diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go
new file mode 100644
index 000000000..f1a4887ef
--- /dev/null
+++ b/runsc/cmd/cmd.go
@@ -0,0 +1,98 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package cmd holds implementations of the runsc commands.
+package cmd
+
+import (
+	"fmt"
+	"runtime"
+	"strconv"
+	"syscall"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+// intFlags can be used with int flags that appear multiple times.
+type intFlags []int
+
+// String implements flag.Value.
+func (i *intFlags) String() string {
+	return fmt.Sprintf("%v", *i)
+}
+
+// Get implements flag.Getter.
+func (i *intFlags) Get() interface{} {
+	return i
+}
+
+// GetArray returns the values collected so far, in the order they were set.
+func (i *intFlags) GetArray() []int {
+	return *i
+}
+
+// Set implements flag.Value; it appends s parsed as a non-negative integer.
+func (i *intFlags) Set(s string) error {
+	fd, err := strconv.Atoi(s)
+	if err != nil {
+		return fmt.Errorf("invalid flag value: %v", err)
+	}
+	if fd < 0 {
+		return fmt.Errorf("flag value must be greater than or equal to 0: %d", fd)
+	}
+	*i = append(*i, fd)
+	return nil
+}
+
+// setCapsAndCallSelf applies caps to the calling OS thread and then re-executes
+// the binary at specutils.ExePath with args and an empty environment. It only
+// returns on failure, since a successful execve replaces this process.
+func setCapsAndCallSelf(args []string, caps *specs.LinuxCapabilities) error {
+	// The goroutine stays pinned to its thread while capabilities change.
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	if err := applyCaps(caps); err != nil {
+		return fmt.Errorf("applyCaps() failed: %v", err)
+	}
+
+	exe := specutils.ExePath
+	log.Infof("Execve %q again, bye!", exe)
+	execErr := syscall.Exec(exe, args, []string{})
+	return fmt.Errorf("error executing %s: %v", exe, execErr)
+}
+
+// callSelfAsNobody sets GID and UID to nobody and then execve's itself again.
+func callSelfAsNobody(args []string) error {
+	// Keep thread locked while user/group are changed.
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	const nobody = 65534
+	// GID goes first; presumably setgid would be refused once the UID is dropped.
+	if _, _, err := syscall.RawSyscall(syscall.SYS_SETGID, uintptr(nobody), 0, 0); err != 0 {
+		return fmt.Errorf("error setting gid: %v", err)
+	}
+	if _, _, err := syscall.RawSyscall(syscall.SYS_SETUID, uintptr(nobody), 0, 0); err != 0 {
+		return fmt.Errorf("error setting uid: %v", err)
+	}
+
+	binPath := specutils.ExePath
+
+	log.Infof("Execve %q again, bye!", binPath)
+	err := syscall.Exec(binPath, args, []string{})
+	return fmt.Errorf("error executing %s: %v", binPath, err)
+}
diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go
new file mode 100644
index 000000000..910e97577
--- /dev/null
+++ b/runsc/cmd/create.go
@@ -0,0 +1,115 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+// Create implements subcommands.Command for the "create" command.
+type Create struct {
+	// bundleDir is the path to the bundle directory (defaults to the
+	// current working directory).
+	bundleDir string
+
+	// pidFile is the filename that the sandbox pid will be written to.
+	// This file should only be created once the container process inside
+	// the sandbox is ready to use.
+	pidFile string
+
+	// consoleSocket is the path to an AF_UNIX socket which will receive a
+	// file descriptor referencing the master end of the console's
+	// pseudoterminal. This is ignored unless spec.Process.Terminal is
+	// true.
+	consoleSocket string
+
+	// userLog is the path to send user-visible logs to. This log is different
+	// from debug logs. The former is meant to be consumed by the users and should
+	// contain only information that is relevant to the person running the
+	// container, e.g. unsupported syscalls, while the latter is more verbose and
+	// consumed by developers.
+	userLog string
+}
+
+// Name implements subcommands.Command.Name.
+func (*Create) Name() string {
+	return "create"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Create) Synopsis() string {
+	return "create a secure container"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Create) Usage() string {
+	return `create [flags] <container id> - create a secure container
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. All flags default to "".
+func (c *Create) SetFlags(f *flag.FlagSet) {
+	f.StringVar(&c.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory")
+	f.StringVar(&c.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal")
+	f.StringVar(&c.pidFile, "pid-file", "", "filename that the container pid will be written to")
+	f.StringVar(&c.userLog, "user-log", "", "filename to send user-visible logs to. Empty means no logging.")
+}
+
+// Execute implements subcommands.Command.Execute: it creates container <id> from the bundle's spec.
+func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	conf := args[0].(*boot.Config)
+	if conf.Rootless {
+		return Errorf("Rootless mode not supported with %q", c.Name())
+	}
+
+	// An unset --bundle means the bundle is the current working directory.
+	bundle := c.bundleDir
+	if len(bundle) == 0 {
+		bundle = getwdOrDie()
+	}
+
+	spec, err := specutils.ReadSpec(bundle)
+	if err != nil {
+		return Errorf("reading spec: %v", err)
+	}
+	specutils.LogSpec(spec)
+
+	// Create the container. A new sandbox will be created for the
+	// container unless the metadata specifies that it should be run in an
+	// existing container.
+	_, err = container.New(conf, container.Args{
+		ID:            f.Arg(0),
+		Spec:          spec,
+		BundleDir:     bundle,
+		ConsoleSocket: c.consoleSocket,
+		PIDFile:       c.pidFile,
+		UserLog:       c.userLog,
+	})
+	if err != nil {
+		return Errorf("creating container: %v", err)
+	}
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
new file mode 100644
index 000000000..b5de2588b
--- /dev/null
+++ b/runsc/cmd/debug.go
@@ -0,0 +1,304 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"os"
+	"strconv"
+	"strings"
+	"syscall"
+	"time"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+// Debug implements subcommands.Command for the "debug" command.
+type Debug struct {
+	pid              int           // sandbox PID; selects the sandbox instead of a container ID
+	stacks           bool          // dump all sandbox stacks to the log
+	signal           int           // signal to send to the sandbox; sent only when > 0
+	profileHeap      string        // file receiving the heap profile
+	profileCPU       string        // file receiving the CPU profile
+	profileGoroutine string        // file receiving the goroutine profile
+	profileBlock     string        // file receiving the block profile
+	profileMutex     string        // file receiving the mutex profile
+	trace            string        // file receiving the execution trace
+	strace           string        // comma separated syscalls to trace, or "all"/"off"
+	logLevel         string        // new log level: warning/info/debug or 0/1/2
+	logPackets       string        // boolean string toggling packet logging
+	duration         time.Duration // how long CPU profiling and tracing run
+	ps               bool          // list the container's processes
+}
+
+// Name implements subcommands.Command.
+func (*Debug) Name() string {
+	return "debug"
+}
+
+// Synopsis implements subcommands.Command.
+func (*Debug) Synopsis() string {
+	return "shows a variety of debug information"
+}
+
+// Usage implements subcommands.Command.
+func (*Debug) Usage() string {
+	return `debug [flags] <container id>`
+}
+
+// SetFlags implements subcommands.Command. Every action is off unless its flag is set.
+func (d *Debug) SetFlags(f *flag.FlagSet) {
+	f.IntVar(&d.pid, "pid", 0, "sandbox process ID. Container ID is not necessary if this is set")
+	f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log")
+	f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.")
+	f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.")
+	f.StringVar(&d.profileGoroutine, "profile-goroutine", "", "writes goroutine profile to the given file.")
+	f.StringVar(&d.profileBlock, "profile-block", "", "writes block profile to the given file.")
+	f.StringVar(&d.profileMutex, "profile-mutex", "", "writes mutex profile to the given file.")
+	f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles")
+	f.StringVar(&d.trace, "trace", "", "writes an execution trace to the given file.")
+	f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox")
+	f.StringVar(&d.strace, "strace", "", `A comma separated list of syscalls to trace. "all" enables all traces, "off" disables all`)
+	f.StringVar(&d.logLevel, "log-level", "", "The log level to set: warning (0), info (1), or debug (2).")
+	f.StringVar(&d.logPackets, "log-packets", "", "A boolean value to enable or disable packet logging: true or false.")
+	f.BoolVar(&d.ps, "ps", false, "lists processes")
+}
+
+// Execute implements subcommands.Command.Execute. It finds the sandbox by container ID or --pid and runs each requested debug action.
+func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	var c *container.Container
+	conf := args[0].(*boot.Config)
+
+	if d.pid == 0 {
+		// No pid, container ID must have been provided.
+		if f.NArg() != 1 {
+			f.Usage()
+			return subcommands.ExitUsageError
+		}
+		var err error
+		c, err = container.Load(conf.RootDir, f.Arg(0))
+		if err != nil {
+			return Errorf("loading container %q: %v", f.Arg(0), err)
+		}
+	} else {
+		if f.NArg() != 0 {
+			f.Usage()
+			return subcommands.ExitUsageError
+		}
+		// Go over all sandboxes and find the one that matches PID.
+		ids, err := container.List(conf.RootDir)
+		if err != nil {
+			return Errorf("listing containers: %v", err)
+		}
+		for _, id := range ids {
+			candidate, err := container.Load(conf.RootDir, id)
+			if err != nil {
+				return Errorf("loading container %q: %v", id, err)
+			}
+			if candidate.SandboxPid() == d.pid {
+				c = candidate
+				break
+			}
+		}
+		if c == nil {
+			return Errorf("container with PID %d not found", d.pid)
+		}
+	}
+
+	if c.Sandbox == nil || !c.Sandbox.IsRunning() {
+		return Errorf("container sandbox is not running")
+	}
+	log.Infof("Found sandbox %q, PID: %d", c.Sandbox.ID, c.Sandbox.Pid)
+
+	if d.signal > 0 {
+		log.Infof("Sending signal %d to process: %d", d.signal, c.Sandbox.Pid)
+		if err := syscall.Kill(c.Sandbox.Pid, syscall.Signal(d.signal)); err != nil {
+			return Errorf("failed to send signal %d to process %d: %v", d.signal, c.Sandbox.Pid, err)
+		}
+	}
+	if d.stacks {
+		log.Infof("Retrieving sandbox stacks")
+		stacks, err := c.Sandbox.Stacks()
+		if err != nil {
+			return Errorf("retrieving stacks: %v", err)
+		}
+		log.Infof("     *** Stack dump ***\n%s", stacks)
+	}
+	if d.profileHeap != "" {
+		f, err := os.Create(d.profileHeap)
+		if err != nil {
+			return Errorf("%v", err)
+		}
+		defer f.Close()
+
+		if err := c.Sandbox.HeapProfile(f); err != nil {
+			return Errorf("%v", err)
+		}
+		log.Infof("Heap profile written to %q", d.profileHeap)
+	}
+	if d.profileGoroutine != "" {
+		f, err := os.Create(d.profileGoroutine)
+		if err != nil {
+			return Errorf("%v", err)
+		}
+		defer f.Close()
+
+		if err := c.Sandbox.GoroutineProfile(f); err != nil {
+			return Errorf("%v", err)
+		}
+		log.Infof("Goroutine profile written to %q", d.profileGoroutine)
+	}
+	if d.profileBlock != "" {
+		f, err := os.Create(d.profileBlock)
+		if err != nil {
+			return Errorf("%v", err)
+		}
+		defer f.Close()
+
+		if err := c.Sandbox.BlockProfile(f); err != nil {
+			return Errorf("%v", err)
+		}
+		log.Infof("Block profile written to %q", d.profileBlock)
+	}
+	if d.profileMutex != "" {
+		f, err := os.Create(d.profileMutex)
+		if err != nil {
+			return Errorf("%v", err)
+		}
+		defer f.Close()
+
+		if err := c.Sandbox.MutexProfile(f); err != nil {
+			return Errorf("%v", err)
+		}
+		log.Infof("Mutex profile written to %q", d.profileMutex)
+	}
+
+	delay := false
+	if d.profileCPU != "" {
+		delay = true
+		f, err := os.Create(d.profileCPU)
+		if err != nil {
+			return Errorf("%v", err)
+		}
+		defer func() {
+			f.Close()
+			if err := c.Sandbox.StopCPUProfile(); err != nil {
+				Fatalf("%v", err)
+			}
+			log.Infof("CPU profile written to %q", d.profileCPU)
+		}()
+		if err := c.Sandbox.StartCPUProfile(f); err != nil {
+			return Errorf("%v", err)
+		}
+		log.Infof("CPU profile started for %v, writing to %q", d.duration, d.profileCPU)
+	}
+	if d.trace != "" {
+		delay = true
+		f, err := os.Create(d.trace)
+		if err != nil {
+			return Errorf("%v", err)
+		}
+		defer func() {
+			f.Close()
+			if err := c.Sandbox.StopTrace(); err != nil {
+				Fatalf("%v", err)
+			}
+			log.Infof("Trace written to %q", d.trace)
+		}()
+		if err := c.Sandbox.StartTrace(f); err != nil {
+			return Errorf("%v", err)
+		}
+		log.Infof("Tracing started for %v, writing to %q", d.duration, d.trace)
+	}
+
+	if d.strace != "" || len(d.logLevel) != 0 || len(d.logPackets) != 0 {
+		args := control.LoggingArgs{}
+		switch strings.ToLower(d.strace) {
+		case "":
+			// strace not set, nothing to do here.
+
+		case "off":
+			log.Infof("Disabling strace")
+			args.SetStrace = true
+
+		case "all":
+			log.Infof("Enabling all straces")
+			args.SetStrace = true
+			args.EnableStrace = true
+
+		default:
+			log.Infof("Enabling strace for syscalls: %s", d.strace)
+			args.SetStrace = true
+			args.EnableStrace = true
+			args.StraceWhitelist = strings.Split(d.strace, ",")
+		}
+
+		if len(d.logLevel) != 0 {
+			args.SetLevel = true
+			switch strings.ToLower(d.logLevel) {
+			case "warning", "0":
+				args.Level = log.Warning
+			case "info", "1":
+				args.Level = log.Info
+			case "debug", "2":
+				args.Level = log.Debug
+			default:
+				return Errorf("invalid log level %q", d.logLevel)
+			}
+			log.Infof("Setting log level %v", args.Level)
+		}
+
+		if len(d.logPackets) != 0 {
+			args.SetLogPackets = true
+			lp, err := strconv.ParseBool(d.logPackets)
+			if err != nil {
+				return Errorf("invalid value for log_packets %q", d.logPackets)
+			}
+			args.LogPackets = lp
+			if args.LogPackets {
+				log.Infof("Enabling packet logging")
+			} else {
+				log.Infof("Disabling packet logging")
+			}
+		}
+
+		if err := c.Sandbox.ChangeLogging(args); err != nil {
+			return Errorf("%v", err)
+		}
+		log.Infof("Logging options changed")
+	}
+	if d.ps {
+		pList, err := c.Processes()
+		if err != nil {
+			Fatalf("getting processes for container: %v", err)
+		}
+		o, err := control.ProcessListToJSON(pList)
+		if err != nil {
+			Fatalf("generating JSON: %v", err)
+		}
+		log.Infof("%s", o) // o is data and may contain '%'; never use it as the format.
+	}
+
+	// Keep the CPU profile / trace running for the requested duration; the
+	// deferred functions above stop them when this function returns.
+	if delay {
+		time.Sleep(d.duration)
+	}
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go
new file mode 100644
index 000000000..0e4863f50
--- /dev/null
+++ b/runsc/cmd/delete.go
@@ -0,0 +1,87 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"fmt"
+	"os"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+// Delete implements subcommands.Command for the "delete" command.
+type Delete struct {
+	// force allows deleting a container that is not stopped; with it, an unknown container ID only logs a warning.
+	force bool
+}
+
+// Name implements subcommands.Command.Name.
+func (*Delete) Name() string {
+	return "delete"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Delete) Synopsis() string {
+	return "delete resources held by a container"
+}
+
+// Usage implements subcommands.Command.Usage. One or more container IDs may be given.
+func (*Delete) Usage() string {
+	return `delete [flags] <container ids>`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. It registers --force (default false).
+func (d *Delete) SetFlags(f *flag.FlagSet) {
+	f.BoolVar(&d.force, "force", false, "terminate container if running")
+}
+
+// Execute implements subcommands.Command.Execute: every positional argument is a container ID to delete.
+func (d *Delete) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() < 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	err := d.execute(f.Args(), args[0].(*boot.Config))
+	if err != nil {
+		Fatalf("%v", err)
+	}
+	return subcommands.ExitSuccess
+}
+
+func (d *Delete) execute(ids []string, conf *boot.Config) error {
+	for _, id := range ids {
+		c, err := container.Load(conf.RootDir, id)
+		if err != nil {
+			if os.IsNotExist(err) && d.force {
+				log.Warningf("couldn't find container %q: %v", id, err)
+				continue // A missing container must not stop deletion of the remaining ids.
+			}
+			return fmt.Errorf("loading container %q: %v", id, err)
+		}
+		if !d.force && c.Status != container.Created && c.Status != container.Stopped {
+			return fmt.Errorf("cannot delete container that is not stopped without --force flag")
+		}
+		if err := c.Destroy(); err != nil {
+			return fmt.Errorf("destroying container: %v", err)
+		}
+	}
+	return nil
+}
diff --git a/runsc/cmd/delete_test.go b/runsc/cmd/delete_test.go
new file mode 100644
index 000000000..cb59516a3
--- /dev/null
+++ b/runsc/cmd/delete_test.go
@@ -0,0 +1,41 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"io/ioutil"
+	"testing"
+
+	"gvisor.dev/gvisor/runsc/boot"
+)
+
+// TestNotFound checks that deleting an unknown container fails unless forced.
+func TestNotFound(t *testing.T) {
+	rootDir, err := ioutil.TempDir("", "metadata")
+	if err != nil {
+		t.Fatalf("error creating dir: %v", err)
+	}
+	conf := &boot.Config{RootDir: rootDir}
+	missing := []string{"123"}
+
+	// Without --force a missing container is an error.
+	if err := (&Delete{}).execute(missing, conf); err == nil {
+		t.Error("Deleting non-existent container should have failed")
+	}
+	// With --force it is tolerated.
+	if err := (&Delete{force: true}).execute(missing, conf); err != nil {
+		t.Errorf("Deleting non-existent container with --force should NOT have failed: %v", err)
+	}
+}
diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go
new file mode 100644
index 000000000..7d1310c96
--- /dev/null
+++ b/runsc/cmd/do.go
@@ -0,0 +1,385 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"math/rand"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"syscall"
+
+	"github.com/google/subcommands"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+// Do implements subcommands.Command for the "do" command. It sets up a simple
+// sandbox and executes the command inside it. See Usage() for more details.
+type Do struct {
+	root  string // directory used as the sandbox root filesystem
+	cwd   string // working directory for the command
+	ip    string // IPv4 address for the sandbox
+	quiet bool   // suppress runsc's own messages on stdout
+}
+
+// Name implements subcommands.Command.Name.
+func (*Do) Name() string {
+	return "do"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Do) Synopsis() string {
+	return "Simplistic way to execute a command inside the sandbox. It's to be used for testing only."
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Do) Usage() string {
+	return `do [flags] <cmd> - runs a command.
+
+This command starts a sandbox with host filesystem mounted inside as readonly,
+with a writable tmpfs overlay on top of it. The given command is executed inside
+the sandbox. It's to be used to quickly test applications without having to
+install or run docker. It doesn't give nearly as many options and it's to be
+used for testing only.
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. It registers --root, --cwd, --ip and --quiet.
+func (c *Do) SetFlags(f *flag.FlagSet) {
+	f.StringVar(&c.root, "root", "/", `path to the root directory, defaults to "/"`)
+	f.StringVar(&c.cwd, "cwd", ".", "path to the current directory, defaults to the current directory")
+	f.StringVar(&c.ip, "ip", "192.168.10.2", "IPv4 address for the sandbox")
+	f.BoolVar(&c.quiet, "quiet", false, "suppress runsc messages to stdout. Application output is still sent to stdout and stderr")
+}
+
+// Execute implements subcommands.Command.Execute. It runs the command in a throwaway sandbox and stores its wait status in args[1].
+func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if len(f.Args()) == 0 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	conf := args[0].(*boot.Config)
+	waitStatus := args[1].(*syscall.WaitStatus)
+
+	if conf.Rootless {
+		if err := specutils.MaybeRunAsRoot(); err != nil {
+			return Errorf("Error executing inside namespace: %v", err)
+		}
+		// Execution will continue here if no more capabilities are needed...
+	}
+
+	hostname, err := os.Hostname()
+	if err != nil {
+		return Errorf("Error to retrieve hostname: %v", err)
+	}
+
+	// Map the entire host file system, but make it readonly with a writable
+	// overlay on top (ignore --overlay option).
+	conf.Overlay = true
+	absRoot, err := resolvePath(c.root)
+	if err != nil {
+		return Errorf("Error resolving root: %v", err)
+	}
+	absCwd, err := resolvePath(c.cwd)
+	if err != nil {
+		return Errorf("Error resolving current directory: %v", err)
+	}
+
+	spec := &specs.Spec{
+		Root: &specs.Root{
+			Path: absRoot,
+		},
+		Process: &specs.Process{
+			Cwd:          absCwd,
+			Args:         f.Args(),
+			Env:          os.Environ(),
+			Capabilities: specutils.AllCapabilities(),
+		},
+		Hostname: hostname,
+	}
+
+	specutils.LogSpec(spec)
+
+	cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000))
+	if conf.Network == boot.NetworkNone {
+		netns := specs.LinuxNamespace{
+			Type: specs.NetworkNamespace,
+		}
+		if spec.Linux != nil {
+			panic("spec.Linux is not nil")
+		}
+		spec.Linux = &specs.Linux{Namespaces: []specs.LinuxNamespace{netns}}
+
+	} else if conf.Rootless {
+		if conf.Network == boot.NetworkSandbox {
+			c.notifyUser("*** Warning: using host network due to --rootless ***")
+			conf.Network = boot.NetworkHost
+		}
+
+	} else {
+		clean, err := c.setupNet(cid, spec)
+		if err != nil {
+			return Errorf("Error setting up network: %v", err)
+		}
+		defer clean()
+	}
+
+	out, err := json.Marshal(spec)
+	if err != nil {
+		return Errorf("Error to marshal spec: %v", err)
+	}
+	tmpDir, err := ioutil.TempDir("", "runsc-do")
+	if err != nil {
+		return Errorf("Error to create tmp dir: %v", err)
+	}
+	defer os.RemoveAll(tmpDir)
+
+	log.Infof("Changing configuration RootDir to %q", tmpDir)
+	conf.RootDir = tmpDir
+
+	cfgPath := filepath.Join(tmpDir, "config.json")
+	if err := ioutil.WriteFile(cfgPath, out, 0755); err != nil {
+		return Errorf("Error write spec: %v", err)
+	}
+
+	containerArgs := container.Args{
+		ID:        cid,
+		Spec:      spec,
+		BundleDir: tmpDir,
+		Attached:  true,
+	}
+	ct, err := container.New(conf, containerArgs)
+	if err != nil {
+		return Errorf("creating container: %v", err)
+	}
+	defer ct.Destroy()
+
+	if err := ct.Start(conf); err != nil {
+		return Errorf("starting container: %v", err)
+	}
+
+	// Forward signals to init in the container. Thus if we get SIGINT from
+	// ^C, the container gracefully exits, and we can clean up.
+	//
+	// N.B. There is still a window before this where a signal may kill
+	// this process, skipping cleanup.
+	stopForwarding := ct.ForwardSignals(0 /* pid */, false /* fgProcess */)
+	defer stopForwarding()
+
+	ws, err := ct.Wait()
+	if err != nil {
+		return Errorf("waiting for container: %v", err)
+	}
+
+	*waitStatus = ws
+	return subcommands.ExitSuccess
+}
+
+func (c *Do) notifyUser(format string, v ...interface{}) {
+	if verbose := !c.quiet; verbose {
+		fmt.Printf(format+"\n", v...) // Echoed to stdout unless --quiet.
+	}
+	log.Warningf(format, v...) // Always recorded in the log.
+}
+
+func resolvePath(path string) (string, error) {
+	var err error
+	path, err = filepath.Abs(path)
+	if err != nil {
+		return "", fmt.Errorf("resolving %q: %v", path, err)
+	}
+	path = filepath.Clean(path)
+	if err := syscall.Access(path, 0); err != nil {
+		return "", fmt.Errorf("unable to access %q: %v", path, err)
+	}
+	return path, nil
+}
+
+// setupNet wires a veth pair into a new netns named cid, adds /etc bind mounts to spec, and returns the cleanup func.
+func (c *Do) setupNet(cid string, spec *specs.Spec) (func(), error) {
+	dev, err := defaultDevice()
+	if err != nil {
+		return nil, err
+	}
+	peerIP, err := calculatePeerIP(c.ip)
+	if err != nil {
+		return nil, err
+	}
+	veth, peer := deviceNames(cid)
+
+	cmds := []string{
+		fmt.Sprintf("ip link add %s type veth peer name %s", veth, peer),
+
+		// Setup device outside the namespace.
+		fmt.Sprintf("ip addr add %s/24 dev %s", peerIP, peer),
+		fmt.Sprintf("ip link set %s up", peer),
+
+		// Setup device inside the namespace.
+		fmt.Sprintf("ip netns add %s", cid),
+		fmt.Sprintf("ip link set %s netns %s", veth, cid),
+		fmt.Sprintf("ip netns exec %s ip addr add %s/24 dev %s", cid, c.ip, veth),
+		fmt.Sprintf("ip netns exec %s ip link set %s up", cid, veth),
+		fmt.Sprintf("ip netns exec %s ip link set lo up", cid),
+		fmt.Sprintf("ip netns exec %s ip route add default via %s", cid, peerIP),
+
+		// Enable network access.
+		"sysctl -w net.ipv4.ip_forward=1",
+		fmt.Sprintf("iptables -t nat -A POSTROUTING -s %s -o %s -j MASQUERADE", c.ip, dev),
+		fmt.Sprintf("iptables -A FORWARD -i %s -o %s -j ACCEPT", dev, peer),
+		fmt.Sprintf("iptables -A FORWARD -o %s -i %s -j ACCEPT", dev, peer),
+	}
+
+	for _, cmd := range cmds {
+		log.Debugf("Run %q", cmd)
+		args := strings.Split(cmd, " ")
+		if err := exec.Command(args[0], args[1:]...).Run(); err != nil {
+			c.cleanupNet(cid, dev, "", "", "")
+			return nil, fmt.Errorf("failed to run %q: %v", cmd, err)
+		}
+	}
+
+	resolvPath, err := makeFile("/etc/resolv.conf", "nameserver 8.8.8.8\n", spec)
+	if err != nil {
+		c.cleanupNet(cid, dev, "", "", "")
+		return nil, err
+	}
+	hostnamePath, err := makeFile("/etc/hostname", cid+"\n", spec)
+	if err != nil {
+		c.cleanupNet(cid, dev, resolvPath, "", "")
+		return nil, err
+	}
+	hosts := fmt.Sprintf("127.0.0.1\tlocalhost\n%s\t%s\n", c.ip, cid)
+	hostsPath, err := makeFile("/etc/hosts", hosts, spec)
+	if err != nil {
+		c.cleanupNet(cid, dev, resolvPath, hostnamePath, "")
+		return nil, err
+	}
+
+	if spec.Linux == nil {
+		spec.Linux = &specs.Linux{}
+	}
+	netns := specs.LinuxNamespace{
+		Type: specs.NetworkNamespace,
+		Path: filepath.Join("/var/run/netns", cid),
+	}
+	spec.Linux.Namespaces = append(spec.Linux.Namespaces, netns)
+
+	return func() { c.cleanupNet(cid, dev, resolvPath, hostnamePath, hostsPath) }, nil
+}
+
+// cleanupNet undoes, on a best-effort basis, what setupNet created: the peer
+// link, the network namespace named cid, and the temp files backing the /etc
+// mounts.
+//
+// setupNet also calls it after a partial failure, so every step is attempted
+// and failures are only logged. The dev argument is currently unused.
+//
+// None of this is released automatically on process exit; it must be explicit.
+func (c *Do) cleanupNet(cid, dev, resolvPath, hostnamePath, hostsPath string) {
+	_, peer := deviceNames(cid)
+
+	// Each teardown command is attempted even if an earlier one failed.
+	for _, line := range []string{
+		fmt.Sprintf("ip link delete %s", peer),
+		fmt.Sprintf("ip netns delete %s", cid),
+	} {
+		log.Debugf("Run %q", line)
+		argv := strings.Split(line, " ")
+		if err := exec.Command(argv[0], argv[1:]...).Run(); err != nil {
+			log.Warningf("Failed to run %q: %v", line, err)
+		}
+	}
+
+	// Empty paths are skipped by tryRemove.
+	for _, tmp := range []string{resolvPath, hostnamePath, hostsPath} {
+		tryRemove(tmp)
+	}
+}
+
+// deviceNames derives the veth ("ve-") and peer ("vp-") interface names for cid.
+func deviceNames(cid string) (string, string) {
+	veth, peer := "ve-"+cid, "vp-"+cid // Device name is limited to 15 letters.
+	return veth, peer
+}
+
+// defaultDevice returns the fifth whitespace-separated field of `ip route list default`, expected to be the device name.
+func defaultDevice() (string, error) {
+	out, err := exec.Command("ip", "route", "list", "default").CombinedOutput()
+	if err != nil {
+		return "", err
+	}
+	if parts := strings.Fields(string(out)); len(parts) >= 5 { // Fields, not Split: no trailing newline in the name.
+		return parts[4], nil
+	}
+	return "", fmt.Errorf("malformed %q output: %q", "ip route list default", string(out))
+}
+
+// makeFile writes content to a new temp file, appends a read-only bind mount of it at dest to spec, and returns its path.
+func makeFile(dest, content string, spec *specs.Spec) (string, error) {
+	tmpFile, err := ioutil.TempFile("", filepath.Base(dest))
+	if err != nil {
+		return "", err
+	}
+	defer tmpFile.Close() // Only the path is needed from here on; don't leak the descriptor.
+	if _, err := tmpFile.WriteString(content); err != nil {
+		tryRemove(tmpFile.Name())
+		return "", err
+	}
+	spec.Mounts = append(spec.Mounts, specs.Mount{
+		Source:      tmpFile.Name(),
+		Destination: dest,
+		Type:        "bind",
+		Options:     []string{"ro"},
+	})
+	return tmpFile.Name(), nil
+}
+
+// tryRemove deletes path, logging a warning instead of failing if that does not work.
+func tryRemove(path string) {
+	// Callers pass "" for files that were never created.
+	if path != "" {
+		if err := os.Remove(path); err != nil {
+			log.Warningf("Failed to remove %q: %v", path, err)
+		}
+	}
+}
+
+// calculatePeerIP returns ip with its last octet incremented, wrapping from 255 to 1.
+func calculatePeerIP(ip string) (string, error) {
+	parts := strings.Split(ip, ".")
+	if len(parts) != 4 {
+		return "", fmt.Errorf("invalid IP format %q", ip)
+	}
+	n, err := strconv.ParseUint(parts[3], 10, 8) // 8-bit parse rejects negative and >255 octets.
+	if err != nil {
+		return "", fmt.Errorf("invalid IP format %q: %v", ip, err)
+	}
+	if n++; n > 255 {
+		n = 1
+	}
+	return fmt.Sprintf("%s.%s.%s.%d", parts[0], parts[1], parts[2], n), nil
+}
diff --git a/runsc/cmd/error.go b/runsc/cmd/error.go
new file mode 100644
index 000000000..3585b5448
--- /dev/null
+++ b/runsc/cmd/error.go
@@ -0,0 +1,72 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"os"
+	"time"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/pkg/log"
+)
+
+// ErrorLogger is where error messages should be written to. These messages are
+// consumed by containerd and show up to users of command line tools,
+// like docker/kubectl.
+var ErrorLogger io.Writer
+
+type jsonError struct { // JSON record that Errorf writes to ErrorLogger
+	Msg   string    `json:"msg"`   // formatted error message
+	Level string    `json:"level"` // set to "error" by Errorf
+	Time  time.Time `json:"time"`  // time.Now() when Errorf was called
+}
+
+// Errorf reports an error on every available channel: the debug log, stderr
+// and, as a JSON record, ErrorLogger when it is set (the containerd --log).
+// It always returns subcommands.ExitFailure, so Execute() methods can write:
+//    return Errorf("Danger! Danger!")
+//
+func Errorf(format string, args ...interface{}) subcommands.ExitStatus {
+	// stderr may be unavailable when docker or cri-o invokes runsc, so the
+	// message is also logged as a serious-looking warning in addition to
+	// being written to stderr.
+	log.Warningf("FATAL ERROR: "+format, args...)
+	fmt.Fprintf(os.Stderr, format+"\n", args...)
+
+	// Encode the same message as a JSON record for ErrorLogger.
+	encoded, err := json.Marshal(jsonError{
+		Msg:   fmt.Sprintf(format, args...),
+		Level: "error",
+		Time:  time.Now(),
+	})
+	if err != nil {
+		panic(err)
+	}
+	if sink := ErrorLogger; sink != nil {
+		sink.Write(encoded)
+	}
+
+	return subcommands.ExitFailure
+}
+
+// Fatalf logs the same way as Errorf() does, then exits the process with 128.
+func Fatalf(format string, args ...interface{}) {
+	Errorf(format, args...)
+	// Exit with a status unlikely to be used by the application; defers are skipped.
+	os.Exit(128)
+}
diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go
new file mode 100644
index 000000000..51f6a98ed
--- /dev/null
+++ b/runsc/cmd/events.go
@@ -0,0 +1,111 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"encoding/json"
+	"os"
+	"time"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+// Events implements subcommands.Command for the "events" command.
+type Events struct {
+	// The interval between stats reporting.
+	intervalSec int
+	// If true, events will print a single group of stats and exit.
+	stats bool
+}
+
+// Name implements subcommands.Command.Name.
+func (*Events) Name() string {
+	return "events"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Events) Synopsis() string {
+	return "display container events such as OOM notifications, cpu, memory, and IO usage statistics"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Events) Usage() string {
+	return `<container-id>
+
+Where "<container-id>" is the name for the instance of the container.
+
+The events command displays information about the container. By default the
+information is displayed once every 5 seconds.
+
+OPTIONS:
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (evs *Events) SetFlags(f *flag.FlagSet) {
+	f.IntVar(&evs.intervalSec, "interval", 5, "set the stats collection interval, in seconds")
+	f.BoolVar(&evs.stats, "stats", false, "display the container's stats then exit")
+}
+
+// Execute implements subcommands.Command.Execute. It prints the container's
+// event as JSON once (-stats) or every -interval seconds until interrupted.
+func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	id := f.Arg(0)
+	conf := args[0].(*boot.Config)
+
+	c, err := container.Load(conf.RootDir, id)
+	if err != nil {
+		Fatalf("loading sandbox: %v", err)
+	}
+
+	// Repeatedly get stats from the container.
+	for {
+		// Get the event and print it as JSON. A failed fetch prints nothing.
+		// err carries either the fetch or the marshal failure down to the
+		// -stats check below, so it must not be shadowed or overwritten.
+		ev, err := c.Event()
+		if err != nil {
+			log.Warningf("Error getting events for container: %v", err)
+		} else if b, merr := json.Marshal(ev); merr != nil {
+			log.Warningf("Error while marshalling event %v: %v", ev, merr)
+			err = merr
+		} else {
+			os.Stdout.Write(b)
+		}
+
+		// If we're only running once, break. If we're only running
+		// once and there was an error, the command failed.
+		if evs.stats {
+			if err != nil {
+				return subcommands.ExitFailure
+			}
+			break
+		}
+
+		time.Sleep(time.Duration(evs.intervalSec) * time.Second)
+	}
+
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
new file mode 100644
index 000000000..d9a94903e
--- /dev/null
+++ b/runsc/cmd/exec.go
@@ -0,0 +1,481 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"syscall"
+	"time"
+
+	"github.com/google/subcommands"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/urpc"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/console"
+	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+// Exec implements subcommands.Command for the "exec" command.
+type Exec struct {
+	cwd string      // -cwd: current working directory
+	env stringSlice // -env: repeatable, e.g. "-env PATH=/bin -env TERM=xterm"
+	// user contains the UID and GID with which to run the new process.
+	user            user
+	extraKGIDs      stringSlice // -additional-gids: additional gids
+	caps            stringSlice // -cap: capabilities to add for the process
+	detach          bool        // -detach: detach from the container's process
+	processPath     string      // -process: path to the process.json
+	pidFile         string      // -pid-file: file that receives the container pid
+	internalPidFile string      // -internal-pid-file: file that receives the container-internal pid
+
+	// consoleSocket is the path to an AF_UNIX socket which will receive a
+	// file descriptor referencing the master end of the console's
+	// pseudoterminal.
+	consoleSocket string
+}
+
+// Name implements subcommands.Command.Name.
+func (*Exec) Name() string {
+	return "exec"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Exec) Synopsis() string {
+	return "execute new process inside the container"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Exec) Usage() string {
+	return `exec [command options] <container-id> <command> [command options] || --process process.json <container-id>
+
+
+Where "<container-id>" is the name for the instance of the container and
+"<command>" is the command to be executed in the container.
+"<command>" can't be empty unless the "-process" flag is provided.
+
+EXAMPLE:
+If the container is configured to run /bin/ps the following will
+output a list of processes running in the container:
+
+       # runsc exec <container-id> ps
+
+OPTIONS:
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (ex *Exec) SetFlags(f *flag.FlagSet) {
+	f.StringVar(&ex.cwd, "cwd", "", "current working directory")
+	f.Var(&ex.env, "env", "set environment variables (e.g. '-env PATH=/bin -env TERM=xterm')")
+	f.Var(&ex.user, "user", "UID (format: <uid>[:<gid>])")
+	f.Var(&ex.extraKGIDs, "additional-gids", "additional gids")
+	f.Var(&ex.caps, "cap", "add a capability to the bounding set for the process")
+	f.BoolVar(&ex.detach, "detach", false, "detach from the container's process")
+	f.StringVar(&ex.processPath, "process", "", "path to the process.json")
+	f.StringVar(&ex.pidFile, "pid-file", "", "filename that the container pid will be written to")
+	f.StringVar(&ex.internalPidFile, "internal-pid-file", "", "filename that the container-internal pid will be written to")
+	f.StringVar(&ex.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal")
+}
+
+// Execute implements subcommands.Command.Execute. It starts a process in an
+// already created container.
+func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	conf := args[0].(*boot.Config)
+	e, id, err := ex.parseArgs(f, conf.EnableRaw)
+	if err != nil {
+		Fatalf("parsing process spec: %v", err)
+	}
+	waitStatus := args[1].(*syscall.WaitStatus)
+
+	c, err := container.Load(conf.RootDir, id)
+	if err != nil {
+		Fatalf("loading sandbox: %v", err)
+	}
+
+	log.Debugf("Exec arguments: %+v", e)
+	log.Debugf("Exec capabilities: %+v", e.Capabilities)
+
+	// Replace empty settings with defaults from container.
+	if e.WorkingDirectory == "" {
+		e.WorkingDirectory = c.Spec.Process.Cwd
+	}
+	if e.Envv == nil {
+		e.Envv, err = resolveEnvs(c.Spec.Process.Env, ex.env)
+		if err != nil {
+			Fatalf("getting environment variables: %v", err)
+		}
+	}
+
+	if e.Capabilities == nil {
+		e.Capabilities, err = specutils.Capabilities(conf.EnableRaw, c.Spec.Process.Capabilities)
+		if err != nil {
+			Fatalf("creating capabilities: %v", err)
+		}
+		log.Infof("Using exec capabilities from container: %+v", e.Capabilities)
+	}
+
+	// containerd expects an actual process to represent the container being
+	// executed. If detach was specified, start a child in non-detach mode and
+	// write the child's PID to the pid file, so that when the container
+	// returns, the child process also returns and signals containerd.
+	if ex.detach {
+		return ex.execChildAndWait(waitStatus)
+	}
+	return ex.exec(c, e, waitStatus)
+}
+
+// exec starts e inside c, writes any requested pid files, then blocks until
+// the process exits and stores its wait status in *waitStatus.
+func (ex *Exec) exec(c *container.Container, e *control.ExecArgs, waitStatus *syscall.WaitStatus) subcommands.ExitStatus {
+	pid, err := c.Execute(e)
+	if err != nil {
+		return Errorf("executing processes for container: %v", err)
+	}
+
+	if e.StdioIsPty {
+		// Signals sent to this process are forwarded to the foreground
+		// process in the sandbox until exec returns.
+		defer c.ForwardSignals(pid, true /* fgProcess */)()
+	}
+
+	// The internal pid file is written first so that, once the pid file
+	// exists (i.e. `runsc exec -d` has returned), users can safely assume
+	// that the internal pid file is ready too.
+	if path := ex.internalPidFile; path != "" {
+		internalPid := strconv.Itoa(int(pid))
+		if err := ioutil.WriteFile(path, []byte(internalPid), 0644); err != nil {
+			return Errorf("writing internal pid file %q: %v", path, err)
+		}
+	}
+
+	if path := ex.pidFile; path != "" {
+		ownPid := strconv.Itoa(os.Getpid())
+		if err := ioutil.WriteFile(path, []byte(ownPid), 0644); err != nil {
+			return Errorf("writing pid file: %v", err)
+		}
+	}
+
+	// Block until the process exits.
+	status, err := c.WaitPID(pid)
+	if err != nil {
+		return Errorf("waiting on pid %d: %v", pid, err)
+	}
+	*waitStatus = status
+	return subcommands.ExitSuccess
+}
+
+// execChildAndWait re-executes runsc without the detach flag, waits until the
+// child has written its pid file, and then returns without waiting for exit.
+func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStatus {
+	var args []string
+	for _, a := range os.Args[1:] {
+		// Drop only the detach flag itself ("-detach", "--detach", optionally
+		// with "=value"), not every argument that merely contains "detach".
+		name := strings.SplitN(strings.TrimLeft(a, "-"), "=", 2)[0]
+		if !strings.HasPrefix(a, "-") || name != "detach" {
+			args = append(args, a)
+		}
+	}
+
+	// The child must write a pid file so we can tell when it has started; if
+	// none was provided, use one in a temp directory. NOTE(review): the flag is
+	// appended after the positional arguments — confirm the child parses it.
+	pidFile := ex.pidFile
+	if pidFile == "" {
+		tmpDir, err := ioutil.TempDir("", "exec-pid-")
+		if err != nil {
+			Fatalf("creating TempDir: %v", err)
+		}
+		defer os.RemoveAll(tmpDir)
+		pidFile = filepath.Join(tmpDir, "pid")
+		args = append(args, "--pid-file="+pidFile)
+	}
+
+	cmd := exec.Command(specutils.ExePath, args...)
+	cmd.Args[0] = "runsc-exec"
+	// Exec stdio defaults to current process stdio.
+	cmd.Stdin = os.Stdin
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+
+	// If the console control socket file is provided, create a new TTY pair,
+	// send the master on that socket and set the TTY on the sandbox process.
+	if ex.consoleSocket != "" {
+		tty, err := console.NewWithSocket(ex.consoleSocket)
+		if err != nil {
+			Fatalf("setting up console with socket %q: %v", ex.consoleSocket, err)
+		}
+		defer tty.Close()
+		// Set stdio to the new TTY slave.
+		cmd.Stdin = tty
+		cmd.Stdout = tty
+		cmd.Stderr = tty
+		cmd.SysProcAttr = &syscall.SysProcAttr{
+			Setsid:  true,
+			Setctty: true,
+			// The Ctty FD must be the FD in the child process's FD
+			// table. Since we set cmd.Stdin/Stdout/Stderr to the
+			// tty FD, we can use any of 0, 1, or 2 here.
+			// See https://github.com/golang/go/issues/29458.
+			Ctty: 0,
+		}
+	}
+
+	if err := cmd.Start(); err != nil {
+		Fatalf("failure to start child exec process, err: %v", err)
+	}
+	log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, specutils.ExePath, args)
+
+	// Wait for PID file to ensure that child process has started. Otherwise,
+	// '--process' file is deleted as soon as this process returns and the child
+	// may fail to read it.
+	ready := func() (bool, error) {
+		pidb, err := ioutil.ReadFile(pidFile)
+		if err == nil {
+			// File appeared, check whether pid is fully written.
+			pid, err := strconv.Atoi(string(pidb))
+			if err != nil {
+				return false, nil
+			}
+			return pid == cmd.Process.Pid, nil
+		}
+		if pe, ok := err.(*os.PathError); !ok || pe.Err != syscall.ENOENT {
+			return false, err
+		}
+		// No file yet, continue to wait...
+		return false, nil
+	}
+	if err := specutils.WaitForReady(cmd.Process.Pid, 10*time.Second, ready); err != nil {
+		// Don't log fatal error here, otherwise it will override the error logged
+		// by the child process that has failed to start.
+		log.Warningf("Unexpected error waiting for PID file, err: %v", err)
+		return subcommands.ExitFailure
+	}
+	*waitStatus = 0
+	return subcommands.ExitSuccess
+}
+
+// parseArgs builds the ExecArgs either from the command line or, when the
+// --process flag was used, from the JSON process file. It returns the
+// ExecArgs together with the ID of the container to be used.
+func (ex *Exec) parseArgs(f *flag.FlagSet, enableRaw bool) (*control.ExecArgs, string, error) {
+	if ex.processPath != "" {
+		// With a process file, exactly the container ID is expected.
+		if f.NArg() != 1 {
+			f.Usage()
+			return nil, "", fmt.Errorf("a container-id is required")
+		}
+		e, err := ex.argsFromProcessFile(enableRaw)
+		return e, f.Arg(0), err
+	}
+	// Without one, both a container ID and a command are mandatory.
+	if f.NArg() < 2 {
+		f.Usage()
+		return nil, "", fmt.Errorf("both a container-id and command are required")
+	}
+	e, err := ex.argsFromCLI(f.Args()[1:], enableRaw)
+	return e, f.Arg(0), err
+}
+
+// argsFromCLI builds ExecArgs from the exec flags plus argv (the command and
+// its arguments). Malformed flag values are reported as errors to the caller.
+func (ex *Exec) argsFromCLI(argv []string, enableRaw bool) (*control.ExecArgs, error) {
+	extraKGIDs := make([]auth.KGID, 0, len(ex.extraKGIDs))
+	for _, s := range ex.extraKGIDs {
+		kgid, err := strconv.Atoi(s)
+		if err != nil {
+			return nil, fmt.Errorf("parsing GID: %s, %v", s, err)
+		}
+		extraKGIDs = append(extraKGIDs, auth.KGID(kgid))
+	}
+	var caps *auth.TaskCapabilities
+	if len(ex.caps) > 0 {
+		var err error
+		if caps, err = capabilities(ex.caps, enableRaw); err != nil {
+			return nil, fmt.Errorf("capabilities error: %v", err)
+		}
+	}
+
+	return &control.ExecArgs{
+		Argv:             argv,
+		WorkingDirectory: ex.cwd,
+		KUID:             ex.user.kuid,
+		KGID:             ex.user.kgid,
+		ExtraKGIDs:       extraKGIDs,
+		Capabilities:     caps,
+		StdioIsPty:       ex.consoleSocket != "",
+		FilePayload:      urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}},
+	}, nil
+}
+
+func (ex *Exec) argsFromProcessFile(enableRaw bool) (*control.ExecArgs, error) {
+	f, err := os.Open(ex.processPath) // path supplied through the -process flag
+	if err != nil {
+		return nil, fmt.Errorf("error opening process file: %s, %v", ex.processPath, err)
+	}
+	defer f.Close()
+	var p specs.Process
+	if err := json.NewDecoder(f).Decode(&p); err != nil { // the file holds a JSON-encoded specs.Process
+		return nil, fmt.Errorf("error parsing process file: %s, %v", ex.processPath, err)
+	}
+	return argsFromProcess(&p, enableRaw) // conversion of the decoded Process to ExecArgs
+}
+
+// argsFromProcess performs all the non-IO conversion from the Process struct
+// to ExecArgs.
+func argsFromProcess(p *specs.Process, enableRaw bool) (*control.ExecArgs, error) {
+	// Create capabilities.
+	var caps *auth.TaskCapabilities
+	if p.Capabilities != nil {
+		var err error
+		// Starting from Docker 19, capabilities are explicitly set for exec (instead
+		// of nil like before). So we can't distinguish 'exec' from
+		// 'exec --privileged', as both specify CAP_NET_RAW. Therefore, filter
+		// CAP_NET_RAW in the same way as container start.
+		caps, err = specutils.Capabilities(enableRaw, p.Capabilities)
+		if err != nil {
+			return nil, fmt.Errorf("error creating capabilities: %v", err)
+		}
+	}
+
+	// Convert the spec's additional GIDs to KGIDs.
+	extraKGIDs := make([]auth.KGID, 0, len(p.User.AdditionalGids))
+	for _, GID := range p.User.AdditionalGids {
+		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
+	}
+
+	return &control.ExecArgs{
+		Argv:             p.Args,
+		Envv:             p.Env,
+		WorkingDirectory: p.Cwd,
+		KUID:             auth.KUID(p.User.UID),
+		KGID:             auth.KGID(p.User.GID),
+		ExtraKGIDs:       extraKGIDs,
+		Capabilities:     caps,
+		StdioIsPty:       p.Terminal,
+		FilePayload:      urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}},
+	}, nil
+}
+
+// resolveEnvs merges several NAME=VALUE lists into one. When a name occurs
+// more than once, the value seen last wins. The order of the result is
+// unspecified because it is produced by iterating over a map.
+func resolveEnvs(envs ...[]string) ([]string, error) {
+	// Collapse duplicates by keying on the variable name; later entries
+	// overwrite earlier ones.
+	merged := map[string]string{}
+	for _, list := range envs {
+		for _, kv := range list {
+			eq := strings.IndexByte(kv, '=')
+			if eq < 0 {
+				return nil, fmt.Errorf("invalid variable: %s", kv)
+			}
+			merged[kv[:eq]] = kv[eq+1:]
+		}
+	}
+	// Flatten the map back into a list of environment variables of the
+	// form NAME=VALUE.
+	out := make([]string, 0, len(merged))
+	for name, value := range merged {
+		out = append(out, name+"="+value)
+	}
+	return out, nil
+}
+
+// capabilities takes a list of capabilities as strings and returns an
+// auth.TaskCapabilities struct with those capabilities in every capability set.
+// This mimics runc's behavior.
+func capabilities(cs []string, enableRaw bool) (*auth.TaskCapabilities, error) {
+	var specCaps specs.LinuxCapabilities
+	for _, cap := range cs {
+		specCaps.Ambient = append(specCaps.Ambient, cap)
+		specCaps.Bounding = append(specCaps.Bounding, cap)
+		specCaps.Effective = append(specCaps.Effective, cap)
+		specCaps.Inheritable = append(specCaps.Inheritable, cap)
+		specCaps.Permitted = append(specCaps.Permitted, cap)
+	}
+	// Starting from Docker 19, capabilities are explicitly set for exec (instead
+	// of nil like before). So we can't distinguish 'exec' from
+	// 'exec --privileged', as both specify CAP_NET_RAW. Therefore, filter
+	// CAP_NET_RAW in the same way as container start.
+	return specutils.Capabilities(enableRaw, &specCaps)
+}
+
+// stringSlice allows a flag to be used multiple times, where each occurrence
+// adds a value to the flag. For example, a flag called "x" could be invoked
+// via "runsc exec -x foo -x bar", and the corresponding stringSlice would be
+// {"foo", "bar"}.
+type stringSlice []string
+
+// String implements flag.Value.String.
+func (ss *stringSlice) String() string {
+	return fmt.Sprintf("%v", *ss)
+}
+
+// Get implements flag.Value.Get.
+func (ss *stringSlice) Get() interface{} {
+	return ss
+}
+
+// Set implements flag.Value.Set.
+func (ss *stringSlice) Set(s string) error {
+	*ss = append(*ss, s)
+	return nil
+}
+
+// user allows -user to convey a UID and, optionally, a GID separated by a
+// colon.
+type user struct {
+	kuid auth.KUID
+	kgid auth.KGID
+}
+
+func (u *user) String() string {
+	return fmt.Sprintf("%+v", *u)
+}
+
+func (u *user) Get() interface{} {
+	return u
+}
+
+func (u *user) Set(s string) error { // parses the -user flag value "<uid>[:<gid>]" into u
+	parts := strings.SplitN(s, ":", 2) // any further ":" stays in parts[1] and fails Atoi below
+	kuid, err := strconv.Atoi(parts[0])
+	if err != nil {
+		return fmt.Errorf("couldn't parse UID: %s", parts[0])
+	}
+	u.kuid = auth.KUID(kuid) // NOTE(review): a negative value is converted unchecked — confirm this is intended
+	if len(parts) > 1 {
+		kgid, err := strconv.Atoi(parts[1])
+		if err != nil {
+			return fmt.Errorf("couldn't parse GID: %s", parts[1])
+		}
+		u.kgid = auth.KGID(kgid)
+	}
+	return nil // without a ":<gid>" part, u.kgid is left as it was
+}
diff --git a/runsc/cmd/exec_test.go b/runsc/cmd/exec_test.go
new file mode 100644
index 000000000..a1e980d08
--- /dev/null
+++ b/runsc/cmd/exec_test.go
@@ -0,0 +1,154 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"os"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/go-cmp/cmp/cmpopts"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/urpc"
+)
+
+// TestUser checks user.Set against valid and invalid "<uid>[:<gid>]" inputs.
+func TestUser(t *testing.T) {
+	testCases := []struct {
+		input   string
+		want    user
+		wantErr bool
+	}{
+		{input: "0", want: user{kuid: 0, kgid: 0}},
+		{input: "7", want: user{kuid: 7, kgid: 0}},
+		{input: "49:343", want: user{kuid: 49, kgid: 343}},
+		{input: "0:2401", want: user{kuid: 0, kgid: 2401}},
+		{input: "", wantErr: true},
+		{input: "foo", wantErr: true},
+		{input: ":123", wantErr: true},
+		{input: "1:2:3", wantErr: true},
+	}
+
+	for _, tc := range testCases {
+		var got user
+		err := got.Set(tc.input)
+		switch {
+		case tc.wantErr && err == nil:
+			t.Errorf("user.Set(%s): got no error, but wanted one", tc.input)
+		case !tc.wantErr && err != nil:
+			t.Errorf("user.Set(%s): got error %v, but wanted none", tc.input, err)
+		case err == nil && got != tc.want:
+			t.Errorf("user.Set(%s): got %+v, but wanted %+v", tc.input, got, tc.want)
+		}
+	}
+}
+
+func TestCLIArgs(t *testing.T) {
+	testCases := []struct {
+		ex       Exec
+		argv     []string
+		expected control.ExecArgs
+	}{
+		{
+			ex: Exec{
+				cwd:         "/foo/bar",
+				user:        user{kuid: 0, kgid: 0},
+				extraKGIDs:  []string{"1", "2", "3"},
+				caps:        []string{"CAP_DAC_OVERRIDE"},
+				processPath: "",
+			},
+			argv: []string{"ls", "/"},
+			expected: control.ExecArgs{
+				Argv:             []string{"ls", "/"},
+				WorkingDirectory: "/foo/bar",
+				FilePayload:      urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}},
+				KUID:             0,
+				KGID:             0,
+				ExtraKGIDs:       []auth.KGID{1, 2, 3},
+				Capabilities: &auth.TaskCapabilities{
+					BoundingCaps:    auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE),
+					EffectiveCaps:   auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE),
+					InheritableCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE),
+					PermittedCaps:   auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE),
+				},
+			},
+		},
+	}
+
+	for _, tc := range testCases {
+		e, err := tc.ex.argsFromCLI(tc.argv, true)
+		if err != nil {
+			t.Errorf("argsFromCLI(%+v): got error: %+v", tc.ex, err)
+		} else if !cmp.Equal(*e, tc.expected, cmpopts.IgnoreUnexported(os.File{})) {
+			t.Errorf("argsFromCLI(%+v): got %+v, but expected %+v", tc.ex, *e, tc.expected)
+		}
+	}
+}
+
+func TestJSONArgs(t *testing.T) {
+	testCases := []struct {
+		// ex holds CLI values that conflict with p; note the loop below never reads it.
+		ex       Exec
+		p        specs.Process
+		expected control.ExecArgs
+	}{
+		{
+			ex: Exec{
+				cwd:         "/baz/quux",
+				user:        user{kuid: 1, kgid: 1},
+				extraKGIDs:  []string{"4", "5", "6"},
+				caps:        []string{"CAP_SETGID"},
+				processPath: "/bin/foo",
+			},
+			p: specs.Process{
+				User: specs.User{UID: 0, GID: 0, AdditionalGids: []uint32{1, 2, 3}},
+				Args: []string{"ls", "/"},
+				Cwd:  "/foo/bar",
+				Capabilities: &specs.LinuxCapabilities{
+					Bounding:    []string{"CAP_DAC_OVERRIDE"},
+					Effective:   []string{"CAP_DAC_OVERRIDE"},
+					Inheritable: []string{"CAP_DAC_OVERRIDE"},
+					Permitted:   []string{"CAP_DAC_OVERRIDE"},
+				},
+			},
+			expected: control.ExecArgs{
+				Argv:             []string{"ls", "/"},
+				WorkingDirectory: "/foo/bar",
+				FilePayload:      urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}},
+				KUID:             0,
+				KGID:             0,
+				ExtraKGIDs:       []auth.KGID{1, 2, 3},
+				Capabilities: &auth.TaskCapabilities{
+					BoundingCaps:    auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE),
+					EffectiveCaps:   auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE),
+					InheritableCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE),
+					PermittedCaps:   auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE),
+				},
+			},
+		},
+	}
+
+	for _, tc := range testCases {
+		e, err := argsFromProcess(&tc.p, true)
+		if err != nil {
+			t.Errorf("argsFromProcess(%+v): got error: %+v", tc.p, err)
+		} else if !cmp.Equal(*e, tc.expected, cmpopts.IgnoreUnexported(os.File{})) {
+			t.Errorf("argsFromProcess(%+v): got %+v, but expected %+v", tc.p, *e, tc.expected)
+		}
+	}
+}
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
new file mode 100644
index 000000000..28f0d54b9
--- /dev/null
+++ b/runsc/cmd/gofer.go
@@ -0,0 +1,479 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"syscall"
+
+	"github.com/google/subcommands"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/unet"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/flag"
+	"gvisor.dev/gvisor/runsc/fsgofer"
+	"gvisor.dev/gvisor/runsc/fsgofer/filter"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+// caps lists the capability names shared by every set in goferCaps.
+var caps = []string{
+	"CAP_CHOWN",
+	"CAP_DAC_OVERRIDE",
+	"CAP_DAC_READ_SEARCH",
+	"CAP_FOWNER",
+	"CAP_FSETID",
+	"CAP_SYS_CHROOT",
+}
+
+// goferCaps is the minimal capability set the Gofer needs to operate on files.
+var goferCaps = &specs.LinuxCapabilities{
+	Bounding:  caps,
+	Effective: caps,
+	Permitted: caps,
+}
+
+// Gofer implements subcommands.Command for the "gofer" command, which starts a
+// filesystem gofer.  This command should not be called directly.
+type Gofer struct {
+	bundleDir string   // --bundle: path to the root of the bundle directory.
+	ioFDs     intFlags // --io-fds: FDs for the 9P servers, root first, then mounts.
+	applyCaps bool     // --apply-caps: Execute re-executes itself with goferCaps.
+	setUpRoot bool     // --setup-root: Execute calls setupRootFS first.
+
+	panicOnWrite bool // --panic-on-write: copied into fsgofer.Config.PanicOnWrite.
+	specFD       int  // --spec-fd: FD from which the container spec is read.
+	mountsFD     int  // --mounts-fd: FD to which writeMounts sends the resolved mounts.
+}
+
+// Name implements subcommands.Command. The subcommand is named "gofer".
+func (*Gofer) Name() string {
+	return "gofer"
+}
+
+// Synopsis implements subcommands.Command with a one-line description.
+func (*Gofer) Synopsis() string {
+	return "launch a gofer process that serves files over 9P protocol (internal use only)"
+}
+
+// Usage implements subcommands.Command with the command-line synopsis.
+func (*Gofer) Usage() string {
+	return `gofer [flags]`
+}
+
+// SetFlags implements subcommands.Command. It binds each flag to a Gofer field.
+func (g *Gofer) SetFlags(f *flag.FlagSet) {
+	f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory")
+	f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec")
+	f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do")
+	f.BoolVar(&g.panicOnWrite, "panic-on-write", false, "if true, panics on attempts to write to RO mounts. RW mounts are unnaffected")
+	f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process")
+	f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec")
+	f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).")
+}
+
+// Execute implements subcommands.Command. It reads the spec, optionally sets up
+// the root filesystem and re-executes itself with goferCaps, then serves 9P.
+func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	// --mounts-fd is checked here too: without it writeMounts can only fail later.
+	if g.bundleDir == "" || len(g.ioFDs) < 1 || g.specFD < 0 || g.mountsFD < 0 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	specFile := os.NewFile(uintptr(g.specFD), "spec file")
+	defer specFile.Close()
+	spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile)
+	if err != nil {
+		Fatalf("reading spec: %v", err)
+	}
+
+	conf := args[0].(*boot.Config)
+
+	if g.setUpRoot {
+		if err := setupRootFS(spec, conf); err != nil {
+			Fatalf("Error setting up root FS: %v", err)
+		}
+	}
+	if g.applyCaps {
+		// Re-execute with caps and root setup disabled; os.Args is copied, not aliased.
+		selfArgs := append(append([]string(nil), os.Args...), "--apply-caps=false", "--setup-root=false")
+		if err := setCapsAndCallSelf(selfArgs, goferCaps); err != nil {
+			Fatalf("Unable to apply caps: %v", err)
+		}
+		panic("unreachable")
+	}
+
+	// Find what path is going to be served by this gofer.
+	root := spec.Root.Path
+	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+		root = "/root"
+	}
+
+	// Resolve mount points paths, then replace mounts from our spec and send the
+	// mount list over to the sandbox, so they are both in sync.
+	//
+	// Note that all mount points have been mounted in the proper location in
+	// setupRootFS().
+	cleanMounts, err := resolveMounts(conf, spec.Mounts, root)
+	if err != nil {
+		Fatalf("Failure to resolve mounts: %v", err)
+	}
+	spec.Mounts = cleanMounts
+	go func() {
+		if err := g.writeMounts(cleanMounts); err != nil {
+			panic(fmt.Sprintf("Failed to write mounts: %v", err))
+		}
+	}()
+
+	specutils.LogSpec(spec)
+
+	// fsgofer should run with a umask of 0, because we want to preserve file
+	// modes exactly as sent by the sandbox, which will have applied its own umask.
+	syscall.Umask(0)
+
+	if err := fsgofer.OpenProcSelfFD(); err != nil {
+		Fatalf("failed to open /proc/self/fd: %v", err)
+	}
+
+	if err := syscall.Chroot(root); err != nil {
+		Fatalf("failed to chroot to %q: %v", root, err)
+	}
+	if err := syscall.Chdir("/"); err != nil {
+		Fatalf("changing working dir: %v", err)
+	}
+	log.Infof("Process chroot'd to %q", root)
+
+	// Start with root mount, then add any other additional mount as needed.
+	ats := make([]p9.Attacher, 0, len(spec.Mounts)+1)
+	ap, err := fsgofer.NewAttachPoint("/", fsgofer.Config{
+		ROMount:      spec.Root.Readonly,
+		PanicOnWrite: g.panicOnWrite,
+	})
+	if err != nil {
+		Fatalf("creating attach point: %v", err)
+	}
+	ats = append(ats, ap)
+	log.Infof("Serving %q mapped to %q on FD %d (ro: %t)", "/", root, g.ioFDs[0], spec.Root.Readonly)
+
+	mountIdx := 1 // first one is the root
+	for _, m := range spec.Mounts {
+		if specutils.Is9PMount(m) {
+			cfg := fsgofer.Config{
+				ROMount:      isReadonlyMount(m.Options),
+				PanicOnWrite: g.panicOnWrite,
+				HostUDS:      conf.FSGoferHostUDS,
+			}
+			ap, err := fsgofer.NewAttachPoint(m.Destination, cfg)
+			if err != nil {
+				Fatalf("creating attach point: %v", err)
+			}
+			ats = append(ats, ap)
+
+			if mountIdx >= len(g.ioFDs) {
+				Fatalf("no FD found for mount. Did you forget --io-fds? mount: %d, %v", len(g.ioFDs), m)
+			}
+			log.Infof("Serving %q mapped on FD %d (ro: %t)", m.Destination, g.ioFDs[mountIdx], cfg.ROMount)
+			mountIdx++
+		}
+	}
+	if mountIdx != len(g.ioFDs) {
+		Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs))
+	}
+
+	if conf.FSGoferHostUDS {
+		filter.InstallUDSFilters()
+	}
+
+	if err := filter.Install(); err != nil {
+		Fatalf("installing seccomp filters: %v", err)
+	}
+
+	runServers(ats, g.ioFDs)
+	return subcommands.ExitSuccess
+}
+
+// runServers starts one 9P server per FD in ioFDs, pairing ioFDs[i] with
+// ats[i], and blocks until every server loop has returned.
+func runServers(ats []p9.Attacher, ioFDs []int) {
+	var pending sync.WaitGroup
+	pending.Add(len(ioFDs))
+	for idx := range ioFDs {
+		go func(fd int, attacher p9.Attacher) {
+			conn, err := unet.NewSocket(fd)
+			if err != nil {
+				Fatalf("creating server on FD %d: %v", fd, err)
+			}
+			if err := p9.NewServer(attacher).Handle(conn); err != nil {
+				Fatalf("P9 server returned error. Gofer is shutting down. FD: %d, err: %v", fd, err)
+			}
+			pending.Done()
+		}(ioFDs[idx], ats[idx])
+	}
+	pending.Wait()
+	log.Infof("All 9P servers exited.")
+}
+
+// writeMounts serializes mounts as JSON and writes the result to the FD given
+// by --mounts-fd, closing that FD afterwards.
+func (g *Gofer) writeMounts(mounts []specs.Mount) error {
+	data, err := json.Marshal(mounts)
+	if err != nil {
+		return err
+	}
+
+	f := os.NewFile(uintptr(g.mountsFD), "mounts file")
+	if f == nil {
+		// os.NewFile returns nil when handed an invalid descriptor.
+		return fmt.Errorf("invalid mounts FD: %d", g.mountsFD)
+	}
+	defer f.Close()
+	// (*os.File).Write returns a non-nil error on short writes; no retry loop needed.
+	_, err = f.Write(data)
+	return err
+}
+
+// isReadonlyMount reports whether the "ro" option is present in opts.
+func isReadonlyMount(opts []string) bool {
+	readonly := false
+	for i := 0; i < len(opts) && !readonly; i++ {
+		readonly = opts[i] == "ro"
+	}
+	return readonly
+}
+
+// setupRootFS bind mounts spec.Root.Path and the spec's mounts. Unless chroot is
+// disabled for tests, the tree is assembled on a tmpfs over /proc and pivoted into.
+func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
+	// Convert all shared mounts into slaves to be sure that nothing will be
+	// propagated outside of our namespace.
+	if err := syscall.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil {
+		Fatalf("error converting mounts: %v", err)
+	}
+
+	root := spec.Root.Path
+	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+		// runsc can't be re-executed without /proc, so a tmpfs is mounted over
+		// /proc to hold ./proc and ./root; it becomes the new root below, and
+		// after setCapsAndCallSelf runsc will chroot into /root.
+		flags := uintptr(syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_NOEXEC)
+		if err := syscall.Mount("runsc-root", "/proc", "tmpfs", flags, ""); err != nil {
+			Fatalf("error mounting tmpfs: %v", err)
+		}
+		// Prepare tree structure for pivot_root(2).
+		for _, d := range []string{"/proc/proc", "/proc/root"} {
+			if err := os.Mkdir(d, 0755); err != nil {
+				Fatalf("error creating directory %q: %v", d, err)
+			}
+		}
+		if err := syscall.Mount("runsc-proc", "/proc/proc", "proc", flags|syscall.MS_RDONLY, ""); err != nil {
+			Fatalf("error mounting proc: %v", err)
+		}
+		root = "/proc/root"
+	}
+
+	// Mount root path followed by submounts.
+	if err := syscall.Mount(spec.Root.Path, root, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
+		return fmt.Errorf("mounting root on root (%q) err: %v", root, err)
+	}
+
+	flags := uint32(syscall.MS_SLAVE | syscall.MS_REC)
+	if spec.Linux != nil && spec.Linux.RootfsPropagation != "" {
+		flags = specutils.PropOptionsToFlags([]string{spec.Linux.RootfsPropagation})
+	}
+	if err := syscall.Mount("", root, "", uintptr(flags), ""); err != nil {
+		return fmt.Errorf("mounting root (%q) with flags: %#x, err: %v", root, flags, err)
+	}
+
+	// Bind mount the spec's mounts at their resolved destinations inside root.
+	if err := setupMounts(spec.Mounts, root); err != nil {
+		Fatalf("error setting up FS: %v", err)
+	}
+
+	// Create working directory if needed.
+	if spec.Process.Cwd != "" {
+		dst, err := resolveSymlinks(root, spec.Process.Cwd)
+		if err != nil {
+			return fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err)
+		}
+		if err := os.MkdirAll(dst, 0755); err != nil {
+			return fmt.Errorf("creating working directory %q: %v", spec.Process.Cwd, err)
+		}
+	}
+
+	// Check if root needs to be remounted as readonly.
+	if spec.Root.Readonly {
+		// Root is a bind mount by now, so its options can be changed to read-only.
+		log.Infof("Remounting root as readonly: %q", root)
+		flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC)
+		if err := syscall.Mount(root, root, "bind", flags, ""); err != nil {
+			return fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", root, root, flags, err)
+		}
+	}
+
+	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+		if err := pivotRoot("/proc"); err != nil {
+			Fatalf("failed to change the root file system: %v", err)
+		}
+		if err := os.Chdir("/"); err != nil {
+			Fatalf("failed to change working directory: %v", err)
+		}
+	}
+	return nil
+}
+
+// setupMounts bind mounts all supported bind mounts from the spec in their
+// correct location inside root. It will resolve relative paths and symlinks. It
+// also creates directories as needed.
+func setupMounts(mounts []specs.Mount, root string) error {
+	for _, mnt := range mounts {
+		if !(mnt.Type == "bind" && specutils.IsSupportedDevMount(mnt)) {
+			continue
+		}
+
+		target, err := resolveSymlinks(root, mnt.Destination)
+		if err != nil {
+			return fmt.Errorf("resolving symlinks to %q: %v", mnt.Destination, err)
+		}
+
+		bindFlags := specutils.OptionsToFlags(mnt.Options) | syscall.MS_BIND
+		log.Infof("Mounting src: %q, dst: %q, flags: %#x", mnt.Source, target, bindFlags)
+		if err := specutils.Mount(mnt.Source, target, mnt.Type, bindFlags); err != nil {
+			return fmt.Errorf("mounting %v: %v", mnt, err)
+		}
+
+		// Propagation options cannot be set together with other options, so
+		// they are applied with a second mount call when any are present.
+		if propFlags := specutils.PropOptionsToFlags(mnt.Options); propFlags != 0 {
+			if err := syscall.Mount("", target, "", uintptr(propFlags), ""); err != nil {
+				return fmt.Errorf("mount dst: %q, flags: %#x, err: %v", target, propFlags, err)
+			}
+		}
+	}
+	return nil
+}
+
+// resolveMounts resolves relative paths and symlinks in bind mount destinations.
+//
+// Note: mount points must already be in place for resolution to work.
+// Otherwise, it may follow symlinks to locations that would be overwritten
+// with another mount point and return the wrong location. In short, make sure
+// setupMounts() has been called before.
+func resolveMounts(conf *boot.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
+	resolved := make([]specs.Mount, 0, len(mounts))
+	for _, mnt := range mounts {
+		if !(mnt.Type == "bind" && specutils.IsSupportedDevMount(mnt)) {
+			resolved = append(resolved, mnt)
+			continue
+		}
+		absDst, err := resolveSymlinks(root, mnt.Destination)
+		if err != nil {
+			return nil, fmt.Errorf("resolving symlinks to %q: %v", mnt.Destination, err)
+		}
+		relDst, err := filepath.Rel(root, absDst)
+		if err != nil {
+			panic(fmt.Sprintf("%q could not be made relative to %q: %v", absDst, root, err))
+		}
+
+		options, err := adjustMountOptions(conf, filepath.Join(root, relDst), mnt.Options)
+		if err != nil {
+			return nil, err
+		}
+
+		// mnt is the loop's own copy, so updating it leaves the input slice untouched.
+		mnt.Destination = filepath.Join("/", relDst)
+		mnt.Options = options
+		resolved = append(resolved, mnt)
+	}
+	return resolved, nil
+}
+
+// ResolveSymlinks walks 'rel' having 'root' as the root directory. If there are
+// symlinks, they are evaluated relative to 'root' to ensure the end result is
+// the same as if the process was running inside the container.
+func resolveSymlinks(root, rel string) (string, error) {
+	return resolveSymlinksImpl(root, root, rel, 255)
+}
+
+// resolveSymlinksImpl resolves rel starting at base without leaving root; followCount bounds symlink nesting.
+func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, error) {
+	if followCount == 0 {
+		return "", fmt.Errorf("too many symlinks to follow, path: %q", filepath.Join(base, rel))
+	}
+	rel = filepath.Clean(rel)
+	for _, name := range strings.Split(rel, string(filepath.Separator)) {
+		if name == "" {
+			continue
+		}
+		// Note that Join() resolves things like ".." and returns a clean path.
+		path := filepath.Join(base, name)
+		if !strings.HasPrefix(path, root) {
+			// One cannot '..' their way out of root: restart from root instead.
+			base = root
+			continue
+		}
+		fi, err := os.Lstat(path)
+		if err != nil {
+			if !os.IsNotExist(err) {
+				return "", err
+			}
+			// Not found means there is no symlink to check. Just keep walking dirs.
+			base = path
+			continue
+		}
+		if fi.Mode()&os.ModeSymlink != 0 {
+			link, err := os.Readlink(path)
+			if err != nil {
+				return "", err
+			}
+			if filepath.IsAbs(link) {
+				base = root // Absolute targets are interpreted relative to root.
+			}
+			base, err = resolveSymlinksImpl(root, base, link, followCount-1)
+			if err != nil {
+				return "", err
+			}
+			continue
+		}
+		base = path
+	}
+	return base, nil
+}
+
+// adjustMountOptions adds 'overlayfs_stale_read' if mounting over overlayfs.
+func adjustMountOptions(conf *boot.Config, path string, opts []string) ([]string, error) {
+	rv := make([]string, len(opts))
+	copy(rv, opts)
+
+	if conf.OverlayfsStaleRead {
+		statfs := syscall.Statfs_t{}
+		if err := syscall.Statfs(path, &statfs); err != nil {
+			return nil, err
+		}
+		if statfs.Type == unix.OVERLAYFS_SUPER_MAGIC {
+			rv = append(rv, "overlayfs_stale_read")
+		}
+	}
+	return rv, nil
+}
diff --git a/runsc/cmd/gofer_test.go b/runsc/cmd/gofer_test.go
new file mode 100644
index 000000000..cbea7f127
--- /dev/null
+++ b/runsc/cmd/gofer_test.go
@@ -0,0 +1,164 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path"
+	"path/filepath"
+	"testing"
+)
+
+// tmpDir returns $TEST_TMPDIR, or "/tmp" when that variable is empty or unset.
+func tmpDir() string {
+	if fromEnv := os.Getenv("TEST_TMPDIR"); fromEnv != "" {
+		return fromEnv
+	}
+	return "/tmp"
+}
+
+type dir struct { // One filesystem entry for construct to create.
+	rel  string // Path of the entry, joined onto the root given to construct.
+	link string // Symlink target; empty means the entry is a plain directory.
+}
+
+// construct creates every entry of dirs under root. An entry with an empty
+// link becomes a directory (parents included); any other entry becomes a
+// symlink pointing at link, after its parent directory has been created.
+func construct(root string, dirs []dir) error {
+	for _, entry := range dirs {
+		target := path.Join(root, entry.rel)
+		if entry.link == "" {
+			if err := os.MkdirAll(target, 0755); err != nil {
+				return fmt.Errorf("error creating dir: %v", err)
+			}
+		} else if err := os.MkdirAll(path.Dir(target), 0755); err != nil {
+			return fmt.Errorf("error creating dir: %v", err)
+		} else if err := os.Symlink(entry.link, target); err != nil {
+			return fmt.Errorf("error creating symlink: %v", err)
+		}
+	}
+	return nil
+}
+
+// TestResolveSymlinks checks resolveSymlinks against a tree of dirs and symlinks built in a temp dir.
+func TestResolveSymlinks(t *testing.T) {
+	root, err := ioutil.TempDir(tmpDir(), "root")
+	if err != nil {
+		t.Fatal("ioutil.TempDir() failed:", err)
+	}
+	defer os.RemoveAll(root) // Do not leave the temporary tree behind.
+	dirs := []dir{
+		{"dir1/dir11/dir111/dir1111", ""}, // Just a boring dir
+		{"dir1/lnk12", "dir11"},           // Link to sibling
+		{"dir1/lnk13", "./dir11"},         // Link to sibling through self
+		{"dir1/lnk14", "../dir1/dir11"},   // Link to sibling through parent
+		{"dir1/dir15/lnk151", ".."},       // Link to parent
+		{"dir1/lnk16", "dir11/dir111"},    // Link to child
+		{"dir1/lnk17", "."},               // Link to self
+		{"dir1/lnk18", "lnk13"},           // Link to link
+		{"lnk2", "dir1/lnk13"},            // Link to link to link
+		{"dir3/dir21/lnk211", "../.."},    // Link to root relative
+		{"dir3/lnk22", "/"},               // Link to root absolute
+		{"dir3/lnk23", "/dir1"},           // Link to dir absolute
+		{"dir3/lnk24", "/dir1/lnk12"},     // Link to link absolute
+		{"lnk5", "../../.."},              // Link outside root
+	}
+	if err := construct(root, dirs); err != nil {
+		t.Fatal("construct failed:", err)
+	}
+	tests := []struct {
+		name        string
+		rel         string
+		want        string
+		compareHost bool
+	}{
+		{name: "root", rel: "/", want: "/", compareHost: true},
+		{name: "basic dir", rel: "/dir1/dir11/dir111", want: "/dir1/dir11/dir111", compareHost: true},
+		{name: "dot 1", rel: "/dir1/dir11/./dir111", want: "/dir1/dir11/dir111", compareHost: true},
+		{name: "dot 2", rel: "/dir1/././dir11/./././././dir111/.", want: "/dir1/dir11/dir111", compareHost: true},
+		{name: "dotdot 1", rel: "/dir1/dir11/../dir15", want: "/dir1/dir15", compareHost: true},
+		{name: "dotdot 2", rel: "/dir1/dir11/dir1111/../..", want: "/dir1", compareHost: true},
+
+		{name: "link sibling", rel: "/dir1/lnk12", want: "/dir1/dir11", compareHost: true},
+		{name: "link sibling + dir", rel: "/dir1/lnk12/dir111", want: "/dir1/dir11/dir111", compareHost: true},
+		{name: "link sibling through self", rel: "/dir1/lnk13", want: "/dir1/dir11", compareHost: true},
+		{name: "link sibling through parent", rel: "/dir1/lnk14", want: "/dir1/dir11", compareHost: true},
+
+		{name: "link parent", rel: "/dir1/dir15/lnk151", want: "/dir1", compareHost: true},
+		{name: "link parent + dir", rel: "/dir1/dir15/lnk151/dir11", want: "/dir1/dir11", compareHost: true},
+		{name: "link child", rel: "/dir1/lnk16", want: "/dir1/dir11/dir111", compareHost: true},
+		{name: "link child + dir", rel: "/dir1/lnk16/dir1111", want: "/dir1/dir11/dir111/dir1111", compareHost: true},
+		{name: "link self", rel: "/dir1/lnk17", want: "/dir1", compareHost: true},
+		{name: "link self + dir", rel: "/dir1/lnk17/dir11", want: "/dir1/dir11", compareHost: true},
+
+		{name: "link^2", rel: "/dir1/lnk18", want: "/dir1/dir11", compareHost: true},
+		{name: "link^2 + dir", rel: "/dir1/lnk18/dir111", want: "/dir1/dir11/dir111", compareHost: true},
+		{name: "link^3", rel: "/lnk2", want: "/dir1/dir11", compareHost: true},
+		{name: "link^3 + dir", rel: "/lnk2/dir111", want: "/dir1/dir11/dir111", compareHost: true},
+
+		{name: "link abs", rel: "/dir3/lnk23", want: "/dir1"},
+		{name: "link abs + dir", rel: "/dir3/lnk23/dir11", want: "/dir1/dir11"},
+		{name: "link^2 abs", rel: "/dir3/lnk24", want: "/dir1/dir11"},
+		{name: "link^2 abs + dir", rel: "/dir3/lnk24/dir111", want: "/dir1/dir11/dir111"},
+
+		{name: "root link rel", rel: "/dir3/dir21/lnk211", want: "/", compareHost: true},
+		{name: "root link abs", rel: "/dir3/lnk22", want: "/"},
+		{name: "root contain link", rel: "/lnk5/dir1", want: "/dir1"},
+		{name: "root contain dotdot", rel: "/dir1/dir11/../../../../../../../..", want: "/"},
+		{name: "crazy", rel: "/dir3/dir21/lnk211/dir3/lnk22/dir1/dir11/../../lnk5/dir3/../dir3/lnk24/dir111/dir1111/..", want: "/dir1/dir11/dir111"},
+	}
+	for _, tst := range tests {
+		t.Run(tst.name, func(t *testing.T) {
+			got, err := resolveSymlinks(root, tst.rel)
+			if err != nil {
+				t.Fatalf("resolveSymlinks(root, %q) failed: %v", tst.rel, err)
+			}
+			want := path.Join(root, tst.want)
+			if got != want {
+				t.Errorf("resolveSymlinks(root, %q) got: %q, want: %q", tst.rel, got, want)
+			}
+			if tst.compareHost {
+				// Check that host got to the same end result.
+				host, err := filepath.EvalSymlinks(path.Join(root, tst.rel))
+				if err != nil {
+					t.Errorf("filepath.EvalSymlinks(root + %q) failed: %v", tst.rel, err)
+				}
+				if host != got {
+					t.Errorf("resolveSymlinks(root, %q) got: %q, but host resolved to: %q", tst.rel, got, host)
+				}
+			}
+		})
+	}
+}
+
+// TestResolveSymlinksLoop checks that resolveSymlinks returns an error for two
+// symlinks that point at each other.
+func TestResolveSymlinksLoop(t *testing.T) {
+	root, err := ioutil.TempDir(tmpDir(), "root")
+	if err != nil {
+		t.Fatal("ioutil.TempDir() failed:", err)
+	}
+	defer os.RemoveAll(root) // Do not leave the temporary tree behind.
+	dirs := []dir{{"loop1", "loop2"}, {"loop2", "loop1"}}
+	if err := construct(root, dirs); err != nil {
+		t.Fatal("construct failed:", err)
+	}
+	if _, err := resolveSymlinks(root, "loop1"); err == nil {
+		t.Errorf("resolveSymlinks() should have failed")
+	}
+}
diff --git a/runsc/cmd/help.go b/runsc/cmd/help.go
new file mode 100644
index 000000000..c7d210140
--- /dev/null
+++ b/runsc/cmd/help.go
@@ -0,0 +1,126 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+// NewHelp returns a help command for the given commander. It starts out with
+// no additional help topics.
+func NewHelp(cdr *subcommands.Commander) *Help {
+	helpCmd := &Help{cdr: cdr}
+	return helpCmd
+}
+
+// Help implements subcommands.Command for the "help" command. The 'help'
+// command prints help for commands registered to a Commander but also allows for
+// registering additional help commands that print other documentation.
+type Help struct {
+	cdr      *subcommands.Commander // Commander whose groups and commands are explained.
+	commands []subcommands.Command  // Additional help topics appended by Register.
+	help     bool                   // Not referenced anywhere else in this file.
+}
+
+// Name implements subcommands.Command.Name. The subcommand is named "help".
+func (*Help) Name() string {
+	return "help"
+}
+
+// Synopsis implements subcommands.Command.Synopsis with a one-line description.
+func (*Help) Synopsis() string {
+	return "Print help documentation."
+}
+
+// Usage implements subcommands.Command.Usage with the multi-line usage text.
+func (*Help) Usage() string {
+	return `help [<subcommand>]:
+	With an argument, prints detailed information on the use of
+	the specified topic or subcommand. With no argument, print a list of
+	all commands and a brief description of each.
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. Help registers no flags.
+func (h *Help) SetFlags(f *flag.FlagSet) {}
+
+// Execute implements subcommands.Command.Execute: it prints general help, or help for the named topic.
+func (h *Help) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	switch f.NArg() {
+	case 0:
+		fmt.Fprintf(h.cdr.Output, "Usage: %s <flags> <subcommand> <subcommand args>\n\n", h.cdr.Name())
+		fmt.Fprintf(h.cdr.Output, `runsc is a command line client for running applications packaged in the Open
+Container Initiative (OCI) format. Applications run by runsc are run in an
+isolated gVisor sandbox that emulates a Linux environment.
+
+gVisor is a user-space kernel, written in Go, that implements a substantial
+portion of the Linux system call interface. It provides an additional layer
+of isolation between running applications and the host operating system.
+
+Functionality is provided by subcommands. For additional help on individual
+subcommands use "%s %s <subcommand>".
+
+`, h.cdr.Name(), h.Name())
+		h.cdr.VisitGroups(func(g *subcommands.CommandGroup) {
+			h.cdr.ExplainGroup(h.cdr.Output, g)
+		})
+
+		fmt.Fprintf(h.cdr.Output, "Additional help topics (Use \"%s %s <topic>\" to see help on the topic):\n", h.cdr.Name(), h.Name())
+		for _, cmd := range h.commands {
+			fmt.Fprintf(h.cdr.Output, "\t%-15s  %s\n", cmd.Name(), cmd.Synopsis())
+		}
+		fmt.Fprintf(h.cdr.Output, "\nUse \"%s flags\" for a list of top-level flags\n", h.cdr.Name())
+		return subcommands.ExitSuccess
+	default:
+		// Look for commands registered to the commander and print help explanation if found.
+		found := false
+		h.cdr.VisitCommands(func(g *subcommands.CommandGroup, cmd subcommands.Command) {
+			if f.Arg(0) == cmd.Name() {
+				h.cdr.ExplainCommand(h.cdr.Output, cmd)
+				found = true
+			}
+		})
+		if found {
+			return subcommands.ExitSuccess
+		}
+
+		// Next check commands registered to the help command; a match is run, not just explained.
+		for _, cmd := range h.commands {
+			if f.Arg(0) == cmd.Name() {
+				fs := flag.NewFlagSet(f.Arg(0), flag.ContinueOnError)
+				fs.Usage = func() { h.cdr.ExplainCommand(h.cdr.Error, cmd) }
+				cmd.SetFlags(fs)
+				if fs.Parse(f.Args()[1:]) != nil {
+					return subcommands.ExitUsageError
+				}
+				return cmd.Execute(ctx, f, args...) // NOTE(review): passes f, not the parsed fs - confirm intended.
+			}
+		}
+
+		fmt.Fprintf(h.cdr.Error, "Subcommand %s not understood\n", f.Arg(0))
+	}
+
+	f.Usage()
+	return subcommands.ExitUsageError
+}
+
+// Register appends cmd to the additional help topics that Execute lists and dispatches to.
+func (h *Help) Register(cmd subcommands.Command) {
+	h.commands = append(h.commands, cmd)
+}
diff --git a/runsc/cmd/install.go b/runsc/cmd/install.go
new file mode 100644
index 000000000..2e223e3be
--- /dev/null
+++ b/runsc/cmd/install.go
@@ -0,0 +1,210 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"os"
+	"path"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+// Install implements subcommands.Command for the "install" command.
+type Install struct {
+	ConfigFile   string // --config_file: path to the Docker daemon config file.
+	Runtime      string // --runtime: name under which the runtime is registered.
+	Experimental bool   // --experimental: also set "experimental" in the config.
+}
+
+// Name implements subcommands.Command.Name. The subcommand is named "install".
+func (*Install) Name() string {
+	return "install"
+}
+
+// Synopsis implements subcommands.Command.Synopsis with a one-line description.
+func (*Install) Synopsis() string {
+	return "adds a runtime to docker daemon configuration"
+}
+
+// Usage implements subcommands.Command.Usage with the command-line synopsis.
+func (*Install) Usage() string {
+	return `install [flags] <name> [-- [args...]] -- if provided, args are passed to the runtime
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. It binds each flag to an Install field.
+func (i *Install) SetFlags(fs *flag.FlagSet) {
+	fs.StringVar(&i.ConfigFile, "config_file", "/etc/docker/daemon.json", "path to Docker daemon config file")
+	fs.StringVar(&i.Runtime, "runtime", "runsc", "runtime name")
+	fs.BoolVar(&i.Experimental, "experimental", false, "enable experimental features")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (i *Install) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	// Grab the name and arguments.
+	runtimeArgs := f.Args()
+
+	// Extract the executable.
+	path, err := os.Executable()
+	if err != nil {
+		log.Fatalf("Error reading current exectuable: %v", err)
+	}
+
+	// Load the configuration file.
+	c, err := readConfig(i.ConfigFile)
+	if err != nil {
+		log.Fatalf("Error reading config file %q: %v", i.ConfigFile, err)
+	}
+
+	// Add the given runtime.
+	var rts map[string]interface{}
+	if i, ok := c["runtimes"]; ok {
+		rts = i.(map[string]interface{})
+	} else {
+		rts = make(map[string]interface{})
+		c["runtimes"] = rts
+	}
+	rts[i.Runtime] = struct {
+		Path        string   `json:"path,omitempty"`
+		RuntimeArgs []string `json:"runtimeArgs,omitempty"`
+	}{
+		Path:        path,
+		RuntimeArgs: runtimeArgs,
+	}
+
+	// Set experimental if required.
+	if i.Experimental {
+		c["experimental"] = true
+	}
+
+	// Write out the runtime.
+	if err := writeConfig(c, i.ConfigFile); err != nil {
+		log.Fatalf("Error writing config file %q: %v", i.ConfigFile, err)
+	}
+
+	// Success.
+	log.Printf("Added runtime %q with arguments %v to %q.", i.Runtime, runtimeArgs, i.ConfigFile)
+	return subcommands.ExitSuccess
+}
+
+// Uninstall implements subcommands.Command for the "uninstall" command.
+type Uninstall struct {
+	ConfigFile string // --config_file: path to the Docker daemon config file.
+	Runtime    string // --runtime: name of the runtime entry to remove.
+}
+
+// Name implements subcommands.Command.Name. The subcommand is named "uninstall".
+func (*Uninstall) Name() string {
+	return "uninstall"
+}
+
+// Synopsis implements subcommands.Command.Synopsis with a one-line description.
+func (*Uninstall) Synopsis() string {
+	return "removes a runtime from docker daemon configuration"
+}
+
+// Usage implements subcommands.Command.Usage with the command-line synopsis.
+func (*Uninstall) Usage() string {
+	return `uninstall [flags] <name>
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. It binds each flag to an Uninstall field.
+func (u *Uninstall) SetFlags(fs *flag.FlagSet) {
+	fs.StringVar(&u.ConfigFile, "config_file", "/etc/docker/daemon.json", "path to Docker daemon config file")
+	fs.StringVar(&u.Runtime, "runtime", "runsc", "runtime name")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (u *Uninstall) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	log.Printf("Removing runtime %q from %q.", u.Runtime, u.ConfigFile)
+
+	c, err := readConfig(u.ConfigFile)
+	if err != nil {
+		log.Fatalf("Error reading config file %q: %v", u.ConfigFile, err)
+	}
+
+	var rts map[string]interface{}
+	if i, ok := c["runtimes"]; ok {
+		rts = i.(map[string]interface{})
+	} else {
+		log.Fatalf("runtime %q not found", u.Runtime)
+	}
+	if _, ok := rts[u.Runtime]; !ok {
+		log.Fatalf("runtime %q not found", u.Runtime)
+	}
+	delete(rts, u.Runtime)
+
+	if err := writeConfig(c, u.ConfigFile); err != nil {
+		log.Fatalf("Error writing config file %q: %v", u.ConfigFile, err)
+	}
+	return subcommands.ExitSuccess
+}
+
+// readConfig loads the JSON object at path; it always returns a non-nil map when err is nil.
+func readConfig(path string) (map[string]interface{}, error) {
+	configBytes, err := ioutil.ReadFile(path)
+	if err != nil && !os.IsNotExist(err) {
+		return nil, err
+	}
+	c := make(map[string]interface{}) // A missing or empty file yields this empty map.
+	if len(configBytes) > 0 {
+		if err := json.Unmarshal(configBytes, &c); err != nil {
+			return nil, err
+		}
+	}
+	if c == nil { // A literal JSON null resets the map to nil; keep it assignable.
+		c = make(map[string]interface{})
+	}
+	return c, nil
+}
+
+// writeConfig stores c at filename as indented JSON. An existing file is first
+// copied to filename+"~", and missing parent directories are created.
+func writeConfig(c map[string]interface{}, filename string) error {
+	// Marshal the configuration.
+	encoded, err := json.MarshalIndent(c, "", "    ")
+	if err != nil {
+		return err
+	}
+
+	// Back up the old configuration; a file that does not exist has no backup.
+	switch previous, err := ioutil.ReadFile(filename); {
+	case err == nil:
+		if err := ioutil.WriteFile(filename+"~", previous, 0644); err != nil {
+			return fmt.Errorf("error backing up config file %q: %v", filename, err)
+		}
+	case !os.IsNotExist(err):
+		return fmt.Errorf("error reading config file %q: %v", filename, err)
+	}
+
+	// Make the necessary directories.
+	if err := os.MkdirAll(path.Dir(filename), 0755); err != nil {
+		return fmt.Errorf("error creating config directory for %q: %v", filename, err)
+	}
+
+	// Write the new configuration.
+	if err := ioutil.WriteFile(filename, encoded, 0644); err != nil {
+		return fmt.Errorf("error writing config file %q: %v", filename, err)
+	}
+
+	return nil
+}
diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go
new file mode 100644
index 000000000..8282ea0e0
--- /dev/null
+++ b/runsc/cmd/kill.go
@@ -0,0 +1,154 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"fmt"
+	"strconv"
+	"strings"
+	"syscall"
+
+	"github.com/google/subcommands"
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+// Kill implements subcommands.Command for the "kill" command.
+type Kill struct {
+	all bool // set by --all: send the signal to all processes inside the container
+	pid int  // set by --pid: send the signal to this process only; 0 means unset
+}
+
+// Name implements subcommands.Command.Name.
+func (*Kill) Name() string {
+	return "kill"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Kill) Synopsis() string {
+	return "sends a signal to the container"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Kill) Usage() string {
+	return `kill <container id> [signal]`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (k *Kill) SetFlags(f *flag.FlagSet) {
+	f.BoolVar(&k.all, "all", false, "send the specified signal to all processes inside the container")
+	f.IntVar(&k.pid, "pid", 0, "send the specified signal to a specific process")
+}
+
+// Execute implements subcommands.Command.Execute. It loads the container named
+// by the first argument and delivers the requested signal (TERM by default).
+func (k *Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if n := f.NArg(); n < 1 || n > 2 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	containerID := f.Arg(0)
+	conf := args[0].(*boot.Config)
+
+	if k.all && k.pid != 0 {
+		Fatalf("it is invalid to specify both --all and --pid")
+	}
+
+	cont, err := container.Load(conf.RootDir, containerID)
+	if err != nil {
+		Fatalf("loading container: %v", err)
+	}
+
+	// runc and its callers pass the signal as a positional argument even
+	// though the OCI command-line spec asks for a flag, so accept it here.
+	sigName := f.Arg(1)
+	if sigName == "" {
+		sigName = "TERM"
+	}
+	sig, err := parseSignal(sigName)
+	if err != nil {
+		Fatalf("%v", err)
+	}
+
+	// A specific pid targets one process; otherwise signal the container.
+	if k.pid == 0 {
+		if err := cont.SignalContainer(sig, k.all); err != nil {
+			Fatalf("%v", err)
+		}
+		return subcommands.ExitSuccess
+	}
+	if err := cont.SignalProcess(sig, int32(k.pid)); err != nil {
+		Fatalf("failed to signal pid %d: %v", k.pid, err)
+	}
+	return subcommands.ExitSuccess
+}
+
+// parseSignal maps a signal number or name (optional "SIG" prefix, any case) to an entry of signalMap.
+func parseSignal(s string) (syscall.Signal, error) {
+	num, convErr := strconv.Atoi(s)
+	if convErr != nil {
+		if known, found := signalMap[strings.TrimPrefix(strings.ToUpper(s), "SIG")]; found {
+			return known, nil
+		}
+		return -1, fmt.Errorf("unknown signal %q", s)
+	}
+	for _, known := range signalMap {
+		if known == syscall.Signal(num) {
+			return known, nil
+		}
+	}
+	return -1, fmt.Errorf("unknown signal %q", s)
+}
+
+var signalMap = map[string]syscall.Signal{ // keyed by upper-case signal name without the "SIG" prefix
+	"ABRT":   unix.SIGABRT,
+	"ALRM":   unix.SIGALRM,
+	"BUS":    unix.SIGBUS,
+	"CHLD":   unix.SIGCHLD,
+	"CLD":    unix.SIGCLD,
+	"CONT":   unix.SIGCONT,
+	"FPE":    unix.SIGFPE,
+	"HUP":    unix.SIGHUP,
+	"ILL":    unix.SIGILL,
+	"INT":    unix.SIGINT,
+	"IO":     unix.SIGIO,
+	"IOT":    unix.SIGIOT,
+	"KILL":   unix.SIGKILL,
+	"PIPE":   unix.SIGPIPE,
+	"POLL":   unix.SIGPOLL,
+	"PROF":   unix.SIGPROF,
+	"PWR":    unix.SIGPWR,
+	"QUIT":   unix.SIGQUIT,
+	"SEGV":   unix.SIGSEGV,
+	"STKFLT": unix.SIGSTKFLT,
+	"STOP":   unix.SIGSTOP,
+	"SYS":    unix.SIGSYS,
+	"TERM":   unix.SIGTERM,
+	"TRAP":   unix.SIGTRAP,
+	"TSTP":   unix.SIGTSTP,
+	"TTIN":   unix.SIGTTIN,
+	"TTOU":   unix.SIGTTOU,
+	"URG":    unix.SIGURG,
+	"USR1":   unix.SIGUSR1,
+	"USR2":   unix.SIGUSR2,
+	"VTALRM": unix.SIGVTALRM,
+	"WINCH":  unix.SIGWINCH,
+	"XCPU":   unix.SIGXCPU,
+	"XFSZ":   unix.SIGXFSZ,
+}
diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go
new file mode 100644
index 000000000..d8d906fe3
--- /dev/null
+++ b/runsc/cmd/list.go
@@ -0,0 +1,117 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"text/tabwriter"
+	"time"
+
+	"github.com/google/subcommands"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+// List implements subcommands.Command for the "list" command.
+type List struct {
+	quiet  bool   // set by --quiet: only list container ids
+	format string // set by --format: "text" or "json"
+}
+
+// Name implements subcommands.Command.Name.
+func (*List) Name() string {
+	return "list"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*List) Synopsis() string {
+	return "list containers started by runsc with the given root"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*List) Usage() string {
+	return `list [flags]`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (l *List) SetFlags(f *flag.FlagSet) {
+	f.BoolVar(&l.quiet, "quiet", false, "only list container ids")
+	f.StringVar(&l.format, "format", "text", "output format: 'text' (default) or 'json'")
+}
+
+// Execute implements subcommands.Command.Execute. It prints the containers
+// found under the configured root directory in the requested format.
+func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 0 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	conf := args[0].(*boot.Config)
+	ids, err := container.List(conf.RootDir)
+	if err != nil {
+		Fatalf("%v", err)
+	}
+
+	// Quiet mode needs only the ids, so the containers are never loaded.
+	if l.quiet {
+		for i := range ids {
+			fmt.Println(ids[i])
+		}
+		return subcommands.ExitSuccess
+	}
+
+	// Load every container so that its details can be reported.
+	loaded := make([]*container.Container, 0, len(ids))
+	for _, id := range ids {
+		cont, loadErr := container.Load(conf.RootDir, id)
+		if loadErr != nil {
+			Fatalf("loading container %q: %v", id, loadErr)
+		}
+		loaded = append(loaded, cont)
+	}
+
+	switch l.format {
+	case "json":
+		// Emit only each container's state.
+		var states []specs.State
+		for _, cont := range loaded {
+			states = append(states, cont.State())
+		}
+		if err := json.NewEncoder(os.Stdout).Encode(states); err != nil {
+			Fatalf("marshaling container state: %v", err)
+		}
+	case "text":
+		// Render an aligned table, one row per container.
+		const rowFormat = "%s\t%d\t%s\t%s\t%s\t%s\n"
+		table := tabwriter.NewWriter(os.Stdout, 12, 1, 3, ' ', 0)
+		fmt.Fprint(table, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\tOWNER\n")
+		for _, cont := range loaded {
+			fmt.Fprintf(table, rowFormat,
+				cont.ID, cont.SandboxPid(), cont.Status,
+				cont.BundleDir, cont.CreatedAt.Format(time.RFC3339Nano), cont.Owner)
+		}
+		table.Flush()
+	default:
+		Fatalf("unknown list format %q", l.format)
+	}
+
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/path.go b/runsc/cmd/path.go
new file mode 100644
index 000000000..0e9ef7fa5
--- /dev/null
+++ b/runsc/cmd/path.go
@@ -0,0 +1,28 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"os"
+)
+
+// getwdOrDie returns the current working directory and dies if it cannot.
+func getwdOrDie() string {
+	wd, err := os.Getwd()
+	if err != nil {
+		Fatalf("getting current working directory: %v", err)
+	}
+	return wd
+}
diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go
new file mode 100644
index 000000000..6f95a9837
--- /dev/null
+++ b/runsc/cmd/pause.go
@@ -0,0 +1,68 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+// Pause implements subcommands.Command for the "pause" command.
+type Pause struct{}
+
+// Name implements subcommands.Command.Name.
+func (*Pause) Name() string {
+	return "pause"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Pause) Synopsis() string {
+	return "pause suspends all processes in a container"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Pause) Usage() string {
+	return `pause <container id> - pause process in instance of container.`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (*Pause) SetFlags(f *flag.FlagSet) {
+}
+
+// Execute implements subcommands.Command.Execute. It pauses the container
+// whose id is given as the only argument.
+func (*Pause) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	conf := args[0].(*boot.Config)
+	target, loadErr := container.Load(conf.RootDir, f.Arg(0))
+	if loadErr != nil {
+		Fatalf("loading container: %v", loadErr)
+	}
+
+	pauseErr := target.Pause()
+	if pauseErr != nil {
+		Fatalf("pause failed: %v", pauseErr)
+	}
+
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go
new file mode 100644
index 000000000..7fb8041af
--- /dev/null
+++ b/runsc/cmd/ps.go
@@ -0,0 +1,86 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+// PS implements subcommands.Command for the "ps" command.
+type PS struct {
+	format string
+}
+
+// Name implements subcommands.Command.Name.
+func (*PS) Name() string {
+	return "ps"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*PS) Synopsis() string {
+	return "ps displays the processes running inside a container"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*PS) Usage() string {
+	return "ps [flags] <container-id>" // Execute accepts exactly one positional argument
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (ps *PS) SetFlags(f *flag.FlagSet) {
+	f.StringVar(&ps.format, "format", "table", "output format. Select one of: table or json (default: table)")
+}
+
+// Execute implements subcommands.Command.Execute. It prints the process list
+// of the container named by the only argument, as a table or as JSON.
+func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	conf := args[0].(*boot.Config)
+	cont, err := container.Load(conf.RootDir, f.Arg(0))
+	if err != nil {
+		Fatalf("loading sandbox: %v", err)
+	}
+	procs, err := cont.Processes()
+	if err != nil {
+		Fatalf("getting processes for container: %v", err)
+	}
+
+	// Pick the rendering requested through --format.
+	if ps.format == "table" {
+		fmt.Println(control.ProcessListToTable(procs))
+		return subcommands.ExitSuccess
+	}
+	if ps.format != "json" {
+		Fatalf("unsupported format: %s", ps.format)
+		return subcommands.ExitSuccess
+	}
+	out, jsonErr := control.PrintPIDsJSON(procs)
+	if jsonErr != nil {
+		Fatalf("generating JSON: %v", jsonErr)
+	}
+	fmt.Println(out)
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go
new file mode 100644
index 000000000..72584b326
--- /dev/null
+++ b/runsc/cmd/restore.go
@@ -0,0 +1,119 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"path/filepath"
+	"syscall"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+// Restore implements subcommands.Command for the "restore" command.
+type Restore struct {
+	// Restore flags are a super-set of those for Create.
+	Create
+
+	// imagePath is the directory that holds the saved container image.
+	imagePath string
+
+	// detach indicates that runsc should start the process and exit without waiting for it.
+	detach bool
+}
+
+// Name implements subcommands.Command.Name.
+func (*Restore) Name() string {
+	return "restore"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Restore) Synopsis() string {
+	return "restore a saved state of container (experimental)"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Restore) Usage() string {
+	return `restore [flags] <container id> - restore saved state of container.
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (r *Restore) SetFlags(f *flag.FlagSet) {
+	r.Create.SetFlags(f)
+	f.StringVar(&r.imagePath, "image-path", "", "directory path to saved container image")
+	f.BoolVar(&r.detach, "detach", false, "detach from the container's process")
+
+	// Unimplemented flags necessary for compatibility with docker.
+
+	var nsr bool
+	f.BoolVar(&nsr, "no-subreaper", false, "ignored")
+
+	var wp string
+	f.StringVar(&wp, "work-path", "", "ignored")
+}
+
+// Execute implements subcommands.Command.Execute. It points conf.RestoreFile at
+// the checkpoint file under --image-path and runs the container from it.
+func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	conf := args[0].(*boot.Config)
+	waitStatus := args[1].(*syscall.WaitStatus)
+	if conf.Rootless {
+		return Errorf("Rootless mode not supported with %q", r.Name())
+	}
+
+	// Fall back to the working directory when no bundle was given.
+	bundle := r.bundleDir
+	if bundle == "" {
+		bundle = getwdOrDie()
+	}
+
+	spec, err := specutils.ReadSpec(bundle)
+	if err != nil {
+		return Errorf("reading spec: %v", err)
+	}
+	specutils.LogSpec(spec)
+
+	if r.imagePath == "" {
+		return Errorf("image-path flag must be provided")
+	}
+	conf.RestoreFile = filepath.Join(r.imagePath, checkpointFileName)
+
+	ws, err := container.Run(conf, container.Args{
+		ID:            f.Arg(0),
+		Spec:          spec,
+		BundleDir:     bundle,
+		ConsoleSocket: r.consoleSocket,
+		PIDFile:       r.pidFile,
+		UserLog:       r.userLog,
+		Attached:      !r.detach,
+	})
+	if err != nil {
+		return Errorf("running container: %v", err)
+	}
+
+	// Hand the wait status back through the caller-supplied pointer.
+	*waitStatus = ws
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go
new file mode 100644
index 000000000..61a55a554
--- /dev/null
+++ b/runsc/cmd/resume.go
@@ -0,0 +1,69 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+// Resume implements subcommands.Command for the "resume" command.
+type Resume struct{}
+
+// Name implements subcommands.Command.Name.
+func (*Resume) Name() string {
+	return "resume"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Resume) Synopsis() string {
+	return "Resume unpauses a paused container"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Resume) Usage() string {
+	return `resume <container id> - resume a paused container.
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (r *Resume) SetFlags(f *flag.FlagSet) {
+}
+
+// Execute implements subcommands.Command.Execute. It resumes the container
+// whose id is given as the only argument.
+func (r *Resume) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	conf := args[0].(*boot.Config)
+	target, loadErr := container.Load(conf.RootDir, f.Arg(0))
+	if loadErr != nil {
+		Fatalf("loading container: %v", loadErr)
+	}
+
+	resumeErr := target.Resume()
+	if resumeErr != nil {
+		Fatalf("resume failed: %v", resumeErr)
+	}
+
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go
new file mode 100644
index 000000000..cf41581ad
--- /dev/null
+++ b/runsc/cmd/run.go
@@ -0,0 +1,100 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"syscall"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+// Run implements subcommands.Command for the "run" command.
+type Run struct {
+	// Run flags are a super-set of those for Create.
+	Create
+
+	// detach indicates that runsc should start the process and exit without waiting for it.
+	detach bool
+}
+
+// Name implements subcommands.Command.Name.
+func (*Run) Name() string {
+	return "run"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Run) Synopsis() string {
+	return "create and run a secure container"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Run) Usage() string {
+	return `run [flags] <container id> - create and run a secure container.
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (r *Run) SetFlags(f *flag.FlagSet) {
+	f.BoolVar(&r.detach, "detach", false, "detach from the container's process")
+	r.Create.SetFlags(f)
+}
+
+// Execute implements subcommands.Command.Execute. It reads the bundle's spec
+// and runs the container named by the only argument.
+func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	conf := args[0].(*boot.Config)
+	waitStatus := args[1].(*syscall.WaitStatus)
+	if conf.Rootless {
+		return Errorf("Rootless mode not supported with %q", r.Name())
+	}
+
+	// Fall back to the working directory when no bundle was given.
+	bundle := r.bundleDir
+	if bundle == "" {
+		bundle = getwdOrDie()
+	}
+
+	spec, err := specutils.ReadSpec(bundle)
+	if err != nil {
+		return Errorf("reading spec: %v", err)
+	}
+	specutils.LogSpec(spec)
+
+	ws, err := container.Run(conf, container.Args{
+		ID:            f.Arg(0),
+		Spec:          spec,
+		BundleDir:     bundle,
+		ConsoleSocket: r.consoleSocket,
+		PIDFile:       r.pidFile,
+		UserLog:       r.userLog,
+		Attached:      !r.detach,
+	})
+	if err != nil {
+		return Errorf("running container: %v", err)
+	}
+	// Hand the wait status back through the caller-supplied pointer.
+	*waitStatus = ws
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go
new file mode 100644
index 000000000..8e2b36e85
--- /dev/null
+++ b/runsc/cmd/spec.go
@@ -0,0 +1,182 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+var specTemplate = []byte(`{
+	"ociVersion": "1.0.0",
+	"process": {
+		"terminal": true,
+		"user": {
+			"uid": 0,
+			"gid": 0
+		},
+		"args": [
+			"sh"
+		],
+		"env": [
+			"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+			"TERM=xterm"
+		],
+		"cwd": "/",
+		"capabilities": {
+			"bounding": [
+				"CAP_AUDIT_WRITE",
+				"CAP_KILL",
+				"CAP_NET_BIND_SERVICE"
+			],
+			"effective": [
+				"CAP_AUDIT_WRITE",
+				"CAP_KILL",
+				"CAP_NET_BIND_SERVICE"
+			],
+			"inheritable": [
+				"CAP_AUDIT_WRITE",
+				"CAP_KILL",
+				"CAP_NET_BIND_SERVICE"
+			],
+			"permitted": [
+				"CAP_AUDIT_WRITE",
+				"CAP_KILL",
+				"CAP_NET_BIND_SERVICE"
+			],
+			"ambient": [
+				"CAP_AUDIT_WRITE",
+				"CAP_KILL",
+				"CAP_NET_BIND_SERVICE"
+			]
+		},
+		"rlimits": [
+			{
+				"type": "RLIMIT_NOFILE",
+				"hard": 1024,
+				"soft": 1024
+			}
+		]
+	},
+	"root": {
+		"path": "rootfs",
+		"readonly": true
+	},
+	"hostname": "runsc",
+	"mounts": [
+		{
+			"destination": "/proc",
+			"type": "proc",
+			"source": "proc"
+		},
+		{
+			"destination": "/dev",
+			"type": "tmpfs",
+			"source": "tmpfs",
+			"options": []
+		},
+		{
+			"destination": "/sys",
+			"type": "sysfs",
+			"source": "sysfs",
+			"options": [
+				"nosuid",
+				"noexec",
+				"nodev",
+				"ro"
+			]
+		}
+	],
+	"linux": {
+		"namespaces": [
+			{
+				"type": "pid"
+			},
+			{
+				"type": "network"
+			},
+			{
+				"type": "ipc"
+			},
+			{
+				"type": "uts"
+			},
+			{
+				"type": "mount"
+			}
+		]
+	}
+}`)
+
+// Spec implements subcommands.Command for the "spec" command.
+type Spec struct {
+	bundle string
+}
+
+// Name implements subcommands.Command.Name.
+func (*Spec) Name() string {
+	return "spec"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Spec) Synopsis() string {
+	return "create a new OCI bundle specification file"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Spec) Usage() string {
+	return `spec [options] - create a new OCI bundle specification file.
+
+The spec command creates a new specification file (config.json) for a new OCI bundle.
+
+The specification file is a starter file that runs the "sh" command in the container. You
+should edit the file to suit your needs. You can find out more about the format of the
+specification file by visiting the OCI runtime spec repository:
+https://github.com/opencontainers/runtime-spec/
+
+EXAMPLE:
+    $ mkdir -p bundle/rootfs
+    $ cd bundle
+    $ runsc spec
+    $ docker export $(docker create hello-world) | tar -xf - -C rootfs
+    $ sed -i 's;"sh";"/hello";' config.json
+    $ sudo runsc run hello
+
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (s *Spec) SetFlags(f *flag.FlagSet) {
+	f.StringVar(&s.bundle, "bundle", ".", "path to the root of the OCI bundle")
+}
+
+// Execute implements subcommands.Command.Execute; it writes specTemplate to <bundle>/config.json.
+func (s *Spec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	confPath := filepath.Join(s.bundle, "config.json")
+	if _, err := os.Stat(confPath); err == nil {
+		Fatalf("file %q already exists", confPath)
+	} else if !os.IsNotExist(err) { // e.g. permission denied: not proof that the file exists
+		Fatalf("checking for %q: %v", confPath, err)
+	}
+	if err := ioutil.WriteFile(confPath, specTemplate, 0664); err != nil {
+		Fatalf("writing to %q: %v", confPath, err)
+	}
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go
new file mode 100644
index 000000000..0205fd9f7
--- /dev/null
+++ b/runsc/cmd/start.go
@@ -0,0 +1,65 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+// Start implements subcommands.Command for the "start" command.
+type Start struct{}
+
+// Name implements subcommands.Command.Name.
+func (*Start) Name() string {
+	return "start"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Start) Synopsis() string {
+	return "start a secure container"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Start) Usage() string {
+	return `start <container id> - start a secure container.`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (*Start) SetFlags(f *flag.FlagSet) {}
+
+// Execute implements subcommands.Command.Execute. It loads the container
+// named by the only argument and starts it.
+func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	conf := args[0].(*boot.Config)
+	cont, loadErr := container.Load(conf.RootDir, f.Arg(0))
+	if loadErr != nil {
+		Fatalf("loading container: %v", loadErr)
+	}
+	if startErr := cont.Start(conf); startErr != nil {
+		Fatalf("starting container: %v", startErr)
+	}
+
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go
new file mode 100644
index 000000000..cf2413deb
--- /dev/null
+++ b/runsc/cmd/state.go
@@ -0,0 +1,76 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"encoding/json"
+	"os"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+// State implements subcommands.Command for the "state" command.
+type State struct{}
+
+// Name implements subcommands.Command.Name.
+func (*State) Name() string {
+	return "state"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*State) Synopsis() string {
+	return "get the state of a container"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*State) Usage() string {
+	return `state [flags] <container id> - get the state of a container`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (*State) SetFlags(f *flag.FlagSet) {}
+
+// Execute implements subcommands.Command.Execute; it prints the container's state as indented JSON.
+func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	id := f.Arg(0)
+	conf := args[0].(*boot.Config)
+	c, err := container.Load(conf.RootDir, id)
+	if err != nil {
+		Fatalf("loading container: %v", err)
+	}
+	log.Debugf("Returning state for container %+v", c)
+
+	state := c.State()
+	log.Debugf("State: %+v", state)
+	b, err := json.MarshalIndent(state, "", "  ")
+	if err != nil {
+		Fatalf("marshaling container state: %v", err)
+	}
+	// The JSON on stdout is this command's only result; a failed write must not report success.
+	if _, err := os.Stdout.Write(b); err != nil {
+		Fatalf("writing container state: %v", err)
+	}
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/statefile.go b/runsc/cmd/statefile.go
new file mode 100644
index 000000000..e6f1907da
--- /dev/null
+++ b/runsc/cmd/statefile.go
@@ -0,0 +1,143 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"fmt"
+	"os"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/pkg/state"
+	"gvisor.dev/gvisor/pkg/state/statefile"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+// Statefile implements subcommands.Command for the "statefile" command. Each
+// field is bound to the command-line flag of the same name in SetFlags.
+type Statefile struct {
+	list   bool   // -list: list the metadata keys in the statefile.
+	get    string // -get: metadata key whose value should be printed.
+	key    string // -key: integrity key passed to the statefile reader.
+	output string // -output: file to write the result to; empty means stdout.
+	html   bool   // -html: emit HTML instead of plain text.
+}
+
+// Name implements subcommands.Command. It returns "statefile", matching the
+// command name shown by Usage and keeping this command distinct from the
+// container "state" command.
+func (*Statefile) Name() string {
+	return "statefile"
+}
+
+// Synopsis implements subcommands.Command. It returns a one-line description
+// of the command.
+func (*Statefile) Synopsis() string {
+	return "shows information about a statefile"
+}
+
+// Usage implements subcommands.Command. The command takes exactly one
+// positional argument, the path of the statefile (enforced in Execute).
+func (*Statefile) Usage() string {
+	return `statefile [flags] <statefile>`
+}
+
+// SetFlags implements subcommands.Command. It binds the -list, -get, -key,
+// -output and -html flags to the corresponding fields of s.
+func (s *Statefile) SetFlags(f *flag.FlagSet) {
+	f.BoolVar(&s.list, "list", false, "lists the metadata in the statefile.")
+	f.StringVar(&s.get, "get", "", "extracts the given metadata key.")
+	f.StringVar(&s.key, "key", "", "the integrity key for the file.")
+	f.StringVar(&s.output, "output", "", "target to write the result.")
+	f.BoolVar(&s.html, "html", false, "outputs in HTML format.")
+}
+
+// Execute implements subcommands.Command.Execute.
+//
+// It operates in one of three modes on the statefile named by the single
+// positional argument:
+//   - neither -list nor -get: pretty-print the full state (using -key as the
+//     integrity key when one is given);
+//   - -get=<key>: print the value of a single metadata key;
+//   - -list: print every metadata key.
+//
+// Output goes to stdout unless -output names a file. With -html the output
+// is wrapped in a minimal HTML document. Errors are reported through Fatalf.
+func (s *Statefile) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	// Check arguments.
+	if s.list && s.get != "" {
+		Fatalf("error: can't specify -list and -get simultaneously.")
+	}
+	// Validate the positional argument before touching the output target, so
+	// that a usage error cannot create or truncate the -output file.
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+
+	// Setup output.
+	var output = os.Stdout // Default.
+	if s.output != "" {
+		outFile, err := os.OpenFile(s.output, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0644)
+		if err != nil {
+			Fatalf("error opening output: %v", err)
+		}
+		// Registered first so it runs last, after the deferred HTML
+		// footers below have been written.
+		defer func() {
+			if err := outFile.Close(); err != nil {
+				Fatalf("error flushing output: %v", err)
+			}
+		}()
+		output = outFile
+	}
+
+	// Open the file.
+	input, err := os.Open(f.Arg(0))
+	if err != nil {
+		Fatalf("error opening input: %v\n", err)
+	}
+	// The input is only read; a close error carries no useful information.
+	defer input.Close()
+
+	if s.html {
+		fmt.Fprintf(output, "<html><body>\n")
+		defer fmt.Fprintf(output, "</body></html>\n")
+	}
+
+	// Dump the full file?
+	if !s.list && s.get == "" {
+		var key []byte
+		if s.key != "" {
+			key = []byte(s.key)
+		}
+		rc, _, err := statefile.NewReader(input, key)
+		if err != nil {
+			Fatalf("error parsing statefile: %v", err)
+		}
+		if err := state.PrettyPrint(output, rc, s.html); err != nil {
+			Fatalf("error printing state: %v", err)
+		}
+		return subcommands.ExitSuccess
+	}
+
+	// Load just the metadata.
+	metadata, err := statefile.MetadataUnsafe(input)
+	if err != nil {
+		Fatalf("error reading metadata: %v", err)
+	}
+
+	// Is it a single key?
+	if s.get != "" {
+		val, ok := metadata[s.get]
+		if !ok {
+			Fatalf("metadata key %s: not found", s.get)
+		}
+		fmt.Fprintf(output, "%s\n", val)
+		return subcommands.ExitSuccess
+	}
+
+	// List all keys.
+	if s.html {
+		fmt.Fprintf(output, " <ul>\n")
+		defer fmt.Fprintf(output, " </ul>\n")
+	}
+	for key := range metadata {
+		if s.html {
+			// NOTE(review): key comes from the statefile and is written
+			// without HTML escaping; confirm statefiles are trusted or
+			// escape it.
+			fmt.Fprintf(output, "  <li>%s</li>\n", key)
+		} else {
+			fmt.Fprintf(output, "%s\n", key)
+		}
+	}
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/cmd/syscalls.go b/runsc/cmd/syscalls.go
new file mode 100644
index 000000000..a37d66139
--- /dev/null
+++ b/runsc/cmd/syscalls.go
@@ -0,0 +1,356 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"encoding/csv"
+	"encoding/json"
+	"fmt"
+	"io"
+	"os"
+	"sort"
+	"strconv"
+	"text/tabwriter"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+// Syscalls implements subcommands.Command for the "syscalls" command. Each
+// field is bound to a command-line flag in SetFlags.
+type Syscalls struct {
+	format   string // -format: key into outputMap (table, csv, json).
+	os       string // -os: OS name, or osAll for every known OS.
+	arch     string // -arch: architecture name, or archAll for every one.
+	filename string // -filename: output file; empty means stdout.
+}
+
+// CompatibilityInfo holds the compatibility documentation selected for
+// output. It maps operating system name to architecture name to ArchInfo.
+type CompatibilityInfo map[string]map[string]ArchInfo
+
+// ArchInfo is the compatibility documentation for a single architecture.
+type ArchInfo struct {
+	// Syscalls maps syscall number for the architecture to the doc.
+	Syscalls map[uintptr]SyscallDoc `json:"syscalls"`
+}
+
+// SyscallDoc represents a single item of syscall documentation.
+type SyscallDoc struct {
+	Name string `json:"name"`
+	// num is the syscall number. It is unexported, so encoding/json does not
+	// emit it; the table and CSV writers use it for sorting and display.
+	num uintptr
+
+	Support string   `json:"support"`
+	Note    string   `json:"note,omitempty"`
+	URLs    []string `json:"urls,omitempty"`
+}
+
+// outputFunc writes the given CompatibilityInfo to the writer in one specific
+// output format. Implementations are registered in outputMap.
+type outputFunc func(io.Writer, CompatibilityInfo) error
+
+var (
+	// osAll is the special -os value that selects every OS present in
+	// syscallTableMap. It is also the flag's default.
+	osAll = "all"
+
+	// archAll is the special -arch value that selects every architecture
+	// known for the chosen OS. It is also the flag's default.
+	archAll = "all"
+
+	// syscallTableMap maps OS name to architecture name to syscall table.
+	// It starts empty and is filled by Syscalls.Execute from
+	// kernel.SyscallTables().
+	syscallTableMap = make(map[string]map[string]*kernel.SyscallTable)
+
+	// outputMap maps the accepted -format values to their output functions.
+	outputMap = map[string]outputFunc{
+		"table": outputTable,
+		"json":  outputJSON,
+		"csv":   outputCSV,
+	}
+)
+
+// Name implements subcommands.Command.Name. It returns the command name,
+// "syscalls".
+func (*Syscalls) Name() string {
+	return "syscalls"
+}
+
+// Synopsis implements subcommands.Command.Synopsis. It returns a one-line
+// description of the command.
+func (*Syscalls) Synopsis() string {
+	return "Print compatibility information for syscalls."
+}
+
+// Usage implements subcommands.Command.Usage. The returned text ends with a
+// newline; the command takes options only, no positional arguments are read.
+func (*Syscalls) Usage() string {
+	return `syscalls [options] - Print compatibility information for syscalls.
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. It binds -format, -os,
+// -arch and -filename to the fields of s; -os and -arch default to "all" and
+// -format defaults to "table".
+func (s *Syscalls) SetFlags(f *flag.FlagSet) {
+	f.StringVar(&s.format, "format", "table", "Output format (table, csv, json).")
+	f.StringVar(&s.os, "os", osAll, "The OS (e.g. linux)")
+	f.StringVar(&s.arch, "arch", archAll, "The CPU architecture (e.g. amd64).")
+	f.StringVar(&s.filename, "filename", "", "Output filename (otherwise stdout).")
+}
+
+// Execute implements subcommands.Command.Execute. It resolves the output
+// function for -format, populates syscallTableMap from
+// kernel.SyscallTables(), collects the compatibility info selected by -os
+// and -arch, and writes it to -filename (or stdout when no filename is
+// given). Every failure is reported through Fatalf.
+func (s *Syscalls) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	out, ok := outputMap[s.format]
+	if !ok {
+		Fatalf("Unsupported output format %q", s.format)
+	}
+
+	// Build map of all supported architectures.
+	tables := kernel.SyscallTables()
+	for _, t := range tables {
+		osMap, ok := syscallTableMap[t.OS.String()]
+		if !ok {
+			osMap = make(map[string]*kernel.SyscallTable)
+			syscallTableMap[t.OS.String()] = osMap
+		}
+		osMap[t.Arch.String()] = t
+	}
+
+	// Build a map of the architectures we want to output.
+	info, err := getCompatibilityInfo(s.os, s.arch)
+	if err != nil {
+		Fatalf("%v", err)
+	}
+
+	w := os.Stdout // Default.
+	if s.filename != "" {
+		w, err = os.OpenFile(s.filename, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
+		if err != nil {
+			Fatalf("Error opening %q: %v", s.filename, err)
+		}
+	}
+	if err := out(w, info); err != nil {
+		Fatalf("Error writing output: %v", err)
+	}
+	// Close the file we opened ourselves; a close error can mean the data
+	// never reached the disk, so it must not be ignored. Stdout is left
+	// open.
+	if s.filename != "" {
+		if err := w.Close(); err != nil {
+			Fatalf("Error closing %q: %v", s.filename, err)
+		}
+	}
+
+	return subcommands.ExitSuccess
+}
+
+// getCompatibilityInfo returns compatibility info for the given OS name and
+// architecture name. Both accept the special name 'all', which selects every
+// OS in syscallTableMap or every architecture known for the OS.
+//
+// A specific OS name that has no entry in syscallTableMap is an error, even
+// when archName is 'all'; otherwise the request would silently produce empty
+// output. On error the partially filled info is returned with the error.
+func getCompatibilityInfo(osName string, archName string) (CompatibilityInfo, error) {
+	info := CompatibilityInfo(make(map[string]map[string]ArchInfo))
+
+	// Resolve the list of concrete OS names to process.
+	osNames := []string{osName}
+	if osName == osAll {
+		osNames = osNames[:0]
+		for name := range syscallTableMap {
+			osNames = append(osNames, name)
+		}
+	} else if _, ok := syscallTableMap[osName]; !ok {
+		return info, fmt.Errorf("syscall tables for OS %q not found", osName)
+	}
+
+	for _, name := range osNames {
+		// addToCompatibilityInfo requires info[name] to exist.
+		info[name] = make(map[string]ArchInfo)
+		if err := addToCompatibilityInfo(info, name, archName); err != nil {
+			return info, err
+		}
+	}
+
+	return info, nil
+}
+
+// addToCompatibilityInfo fills info[osName] with the ArchInfo of archName
+// for the concrete OS osName. The special architecture name 'all' expands to
+// every architecture present in syscallTableMap[osName]. info[osName] must
+// already be a non-nil map. The first lookup failure is returned as is.
+func addToCompatibilityInfo(info CompatibilityInfo, osName string, archName string) error {
+	// Resolve the concrete architecture names first, then handle them all
+	// the same way.
+	wanted := []string{archName}
+	if archName == archAll {
+		wanted = wanted[:0]
+		for name := range syscallTableMap[osName] {
+			wanted = append(wanted, name)
+		}
+	}
+
+	for _, name := range wanted {
+		archInfo, err := getArchInfo(osName, name)
+		if err != nil {
+			return err
+		}
+		info[osName][name] = archInfo
+	}
+	return nil
+}
+
+// getArchInfo builds the ArchInfo for one concrete OS/architecture pair from
+// the matching table in syscallTableMap. When no table is registered for the
+// pair, it returns an ArchInfo with an empty Syscalls map together with an
+// error.
+func getArchInfo(osName string, archName string) (ArchInfo, error) {
+	result := ArchInfo{Syscalls: make(map[uintptr]SyscallDoc)}
+
+	// Indexing the nil inner map of an unknown OS is safe and reports !found.
+	table, found := syscallTableMap[osName][archName]
+	if !found {
+		return result, fmt.Errorf("syscall table for %s/%s not found", osName, archName)
+	}
+
+	for sysno, entry := range table.Table {
+		doc := SyscallDoc{
+			Name:    entry.Name,
+			num:     sysno,
+			Support: entry.SupportLevel.String(),
+			Note:    entry.Note,
+			URLs:    entry.URLs,
+		}
+		result.Syscalls[sysno] = doc
+	}
+
+	return result, nil
+}
+
+// outputTable writes the syscall info as one aligned text table per OS and
+// architecture. Each table is preceded by an "<os>/<arch>:" heading and lists
+// the syscalls in ascending numeric order; issue URLs follow their syscall
+// on extra "See:" rows.
+//
+// OS and architecture names are visited in sorted order so that the output
+// is deterministic rather than following Go's random map iteration order.
+func outputTable(w io.Writer, info CompatibilityInfo) error {
+	tw := tabwriter.NewWriter(w, 0, 0, 2, ' ', 0)
+
+	osNames := make([]string, 0, len(info))
+	for osName := range info {
+		osNames = append(osNames, osName)
+	}
+	sort.Strings(osNames)
+
+	for _, osName := range osNames {
+		osInfo := info[osName]
+		archNames := make([]string, 0, len(osInfo))
+		for archName := range osInfo {
+			archNames = append(archNames, archName)
+		}
+		sort.Strings(archNames)
+
+		for _, archName := range archNames {
+			archInfo := osInfo[archName]
+
+			// Print the OS/arch. This bypasses the tabwriter on purpose:
+			// the heading must not take part in column alignment. The
+			// tabwriter is flushed at the end of every iteration, so the
+			// heading still lands in front of its table.
+			if _, err := fmt.Fprintf(w, "%s/%s:\n\n", osName, archName); err != nil {
+				return err
+			}
+
+			// Sort the syscalls for output in the table.
+			sortedCalls := make([]SyscallDoc, 0, len(archInfo.Syscalls))
+			for _, sc := range archInfo.Syscalls {
+				sortedCalls = append(sortedCalls, sc)
+			}
+			sort.Slice(sortedCalls, func(i, j int) bool {
+				return sortedCalls[i].num < sortedCalls[j].num
+			})
+
+			// Write the header
+			_, err := fmt.Fprintf(tw, "%s\t%s\t%s\t%s\n",
+				"NUM",
+				"NAME",
+				"SUPPORT",
+				"NOTE",
+			)
+			if err != nil {
+				return err
+			}
+
+			// Write each syscall entry
+			for _, sc := range sortedCalls {
+				_, err = fmt.Fprintf(tw, "%s\t%s\t%s\t%s\n",
+					strconv.FormatInt(int64(sc.num), 10),
+					sc.Name,
+					sc.Support,
+					sc.Note,
+				)
+				if err != nil {
+					return err
+				}
+				// Add issue urls to note.
+				for _, url := range sc.URLs {
+					_, err = fmt.Fprintf(tw, "%s\t%s\t%s\tSee: %s\t\n",
+						"",
+						"",
+						"",
+						url,
+					)
+					if err != nil {
+						return err
+					}
+				}
+			}
+
+			if err := tw.Flush(); err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
+
+// outputJSON encodes the whole CompatibilityInfo as a single JSON document
+// on w, indented with two spaces per nesting level.
+func outputJSON(w io.Writer, info CompatibilityInfo) error {
+	encoder := json.NewEncoder(w)
+	encoder.SetIndent("", "  ")
+	if err := encoder.Encode(info); err != nil {
+		return err
+	}
+	return nil
+}
+
+// numberedRow is a CSV row annotated with its syscall number, which is used
+// as the sort key when ordering rows for output.
+type numberedRow struct {
+	num uintptr
+	row []string
+}
+
+// outputCSV writes the syscall info in CSV format with the columns
+// OS, Arch, Num, Name, Support, Note. Issue URLs are appended to the note,
+// one "See: <url>" line each.
+//
+// The header row is written once, before the first OS/architecture pair;
+// every data row carries its own OS and Arch, so repeating the header in
+// the middle of the data would only confuse CSV consumers. Nothing at all is
+// written when info contains no architecture. OS and architecture names are
+// visited in sorted order and syscalls in ascending numeric order, so the
+// output is deterministic.
+func outputCSV(w io.Writer, info CompatibilityInfo) error {
+	csvWriter := csv.NewWriter(w)
+
+	osNames := make([]string, 0, len(info))
+	for osName := range info {
+		osNames = append(osNames, osName)
+	}
+	sort.Strings(osNames)
+
+	wroteHeader := false
+	for _, osName := range osNames {
+		osInfo := info[osName]
+		archNames := make([]string, 0, len(osInfo))
+		for archName := range osInfo {
+			archNames = append(archNames, archName)
+		}
+		sort.Strings(archNames)
+
+		for _, archName := range archNames {
+			archInfo := osInfo[archName]
+
+			// Write the header ahead of the first block of rows only.
+			if !wroteHeader {
+				err := csvWriter.Write([]string{
+					"OS",
+					"Arch",
+					"Num",
+					"Name",
+					"Support",
+					"Note",
+				})
+				if err != nil {
+					return err
+				}
+				wroteHeader = true
+			}
+
+			// Sort the syscalls for output.
+			sortedCalls := make([]numberedRow, 0, len(archInfo.Syscalls))
+			for _, sc := range archInfo.Syscalls {
+				// Add issue urls to note.
+				note := sc.Note
+				for _, url := range sc.URLs {
+					note = fmt.Sprintf("%s\nSee: %s", note, url)
+				}
+
+				sortedCalls = append(sortedCalls, numberedRow{
+					num: sc.num,
+					row: []string{
+						osName,
+						archName,
+						strconv.FormatInt(int64(sc.num), 10),
+						sc.Name,
+						sc.Support,
+						note,
+					},
+				})
+			}
+			sort.Slice(sortedCalls, func(i, j int) bool {
+				return sortedCalls[i].num < sortedCalls[j].num
+			})
+
+			// Write each syscall entry
+			for _, sc := range sortedCalls {
+				if err := csvWriter.Write(sc.row); err != nil {
+					return err
+				}
+			}
+
+			// Flush per architecture so that write errors surface early.
+			csvWriter.Flush()
+			if err := csvWriter.Error(); err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go
new file mode 100644
index 000000000..29c0a15f0
--- /dev/null
+++ b/runsc/cmd/wait.go
@@ -0,0 +1,127 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"encoding/json"
+	"os"
+	"syscall"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/container"
+	"gvisor.dev/gvisor/runsc/flag"
+)
+
+const (
+	// unsetPID is the default value of the -pid and -rootpid flags; it marks
+	// the flag as not given on the command line.
+	unsetPID = -1
+)
+
+// Wait implements subcommands.Command for the "wait" command. Both fields
+// default to unsetPID; at most one of them may be set (checked in Execute).
+type Wait struct {
+	rootPID int // -rootpid: PID in the sandbox root PID namespace.
+	pid     int // -pid: PID in the container's PID namespace.
+}
+
+// Name implements subcommands.Command.Name. It returns the command name,
+// "wait".
+func (*Wait) Name() string {
+	return "wait"
+}
+
+// Synopsis implements subcommands.Command.Synopsis. It returns a one-line
+// description of the command.
+func (*Wait) Synopsis() string {
+	return "wait on a process inside a container"
+}
+
+// Usage implements subcommands.Command.Usage. The command takes exactly one
+// positional argument, the container ID (enforced in Execute).
+func (*Wait) Usage() string {
+	return `wait [flags] <container id>`
+}
+
+// SetFlags implements subcommands.Command.SetFlags. It binds -rootpid and
+// -pid to the fields of wt; both default to unsetPID.
+func (wt *Wait) SetFlags(f *flag.FlagSet) {
+	f.IntVar(&wt.rootPID, "rootpid", unsetPID, "select a PID in the sandbox root PID namespace to wait on instead of the container's root process")
+	f.IntVar(&wt.pid, "pid", unsetPID, "select a PID in the container's PID namespace to wait on instead of the container's root process")
+}
+
+// Execute implements subcommands.Command.Execute. It waits for a process in a
+// container to exit before returning.
+//
+// With neither -pid nor -rootpid it waits on the container as a whole; with
+// -rootpid it waits on a PID in the sandbox root PID namespace; with -pid it
+// waits on a PID in the container's PID namespace. Giving both flags is an
+// error. The result is written to stdout as a JSON object holding the
+// container ID and the exit status computed by exitStatus.
+//
+// args[0] must be the *boot.Config for this invocation. Failures are
+// reported through Fatalf.
+func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if f.NArg() != 1 {
+		f.Usage()
+		return subcommands.ExitUsageError
+	}
+	// You can't specify both -pid and -rootpid.
+	if wt.rootPID != unsetPID && wt.pid != unsetPID {
+		Fatalf("only one of -pid and -rootpid can be set")
+	}
+
+	id := f.Arg(0)
+	conf := args[0].(*boot.Config)
+
+	c, err := container.Load(conf.RootDir, id)
+	if err != nil {
+		Fatalf("loading container: %v", err)
+	}
+
+	var waitStatus syscall.WaitStatus
+	switch {
+	// Wait on the whole container.
+	case wt.rootPID == unsetPID && wt.pid == unsetPID:
+		ws, err := c.Wait()
+		if err != nil {
+			Fatalf("waiting on container %q: %v", c.ID, err)
+		}
+		waitStatus = ws
+	// Wait on a PID in the root PID namespace.
+	case wt.rootPID != unsetPID:
+		ws, err := c.WaitRootPID(int32(wt.rootPID))
+		if err != nil {
+			Fatalf("waiting on PID in root PID namespace %d in container %q: %v", wt.rootPID, c.ID, err)
+		}
+		waitStatus = ws
+	// Wait on a PID in the container's PID namespace.
+	case wt.pid != unsetPID:
+		ws, err := c.WaitPID(int32(wt.pid))
+		if err != nil {
+			Fatalf("waiting on PID %d in container %q: %v", wt.pid, c.ID, err)
+		}
+		waitStatus = ws
+	}
+	result := waitResult{
+		ID:         id,
+		ExitStatus: exitStatus(waitStatus),
+	}
+	// Write json-encoded wait result directly to stdout.
+	if err := json.NewEncoder(os.Stdout).Encode(result); err != nil {
+		Fatalf("marshaling wait result: %v", err)
+	}
+	return subcommands.ExitSuccess
+}
+
+// waitResult is the JSON document the "wait" command writes to stdout.
+type waitResult struct {
+	ID         string `json:"id"`         // Container ID given on the command line.
+	ExitStatus int    `json:"exitStatus"` // Value computed by exitStatus.
+}
+
+// exitStatus returns the correct exit status for a process based on if it
+// was signaled or exited cleanly.
+func exitStatus(status syscall.WaitStatus) int {
+	if status.Signaled() {
+		return 128 + int(status.Signal())
+	}
+	return status.ExitStatus()
+}
diff --git a/runsc/console/BUILD b/runsc/console/BUILD
new file mode 100644
index 000000000..06924bccd
--- /dev/null
+++ b/runsc/console/BUILD
@@ -0,0 +1,17 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+# Go library for the console package (console.go). It is visible only to
+# packages under //runsc.
+go_library(
+    name = "console",
+    srcs = [
+        "console.go",
+    ],
+    visibility = [
+        "//runsc:__subpackages__",
+    ],
+    deps = [
+        "@com_github_kr_pty//:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/runsc/console/console.go b/runsc/console/console.go
new file mode 100644
index 000000000..64b23639a
--- /dev/null
+++ b/runsc/console/console.go
@@ -0,0 +1,63 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package console contains utilities for working with pty consoles in runsc.
+package console
+
+import (
+	"fmt"
+	"net"
+	"os"
+
+	"github.com/kr/pty"
+	"golang.org/x/sys/unix"
+)
+
+// NewWithSocket opens a new pty master/slave pair, connects to the unix
+// socket at socketPath, sends the master FD over that connection as a
+// control message, and returns the slave. The master, the connection and the
+// socket file obtained from it are always closed before returning; the slave
+// is closed as well when an error is returned.
+func NewWithSocket(socketPath string) (*os.File, error) {
+	// Create a new pty master and slave.
+	ptyMaster, ptySlave, err := pty.Open()
+	if err != nil {
+		return nil, fmt.Errorf("opening pty: %v", err)
+	}
+	defer ptyMaster.Close()
+
+	// The slave is handed to the caller on success only; on every failure
+	// path below it is released here.
+	handedOff := false
+	defer func() {
+		if !handedOff {
+			ptySlave.Close()
+		}
+	}()
+
+	// Get a connection to the socket path.
+	conn, err := net.Dial("unix", socketPath)
+	if err != nil {
+		return nil, fmt.Errorf("dialing socket %q: %v", socketPath, err)
+	}
+	defer conn.Close()
+
+	uc, isUnix := conn.(*net.UnixConn)
+	if !isUnix {
+		return nil, fmt.Errorf("connection is not a UnixConn: %T", conn)
+	}
+	socket, err := uc.File()
+	if err != nil {
+		return nil, fmt.Errorf("getting file for unix socket %v: %v", uc, err)
+	}
+	defer socket.Close()
+
+	// Send the master FD over the connection.
+	rights := unix.UnixRights(int(ptyMaster.Fd()))
+	err = unix.Sendmsg(int(socket.Fd()), []byte("pty-master"), rights, nil, 0)
+	if err != nil {
+		return nil, fmt.Errorf("sending console over unix socket %q: %v", socketPath, err)
+	}
+
+	handedOff = true
+	return ptySlave, nil
+}
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
new file mode 100644
index 000000000..46154df60
--- /dev/null
+++ b/runsc/container/BUILD
@@ -0,0 +1,72 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+# Go library for the container package. It is visible to packages under
+# //runsc and //test.
+go_library(
+    name = "container",
+    srcs = [
+        "container.go",
+        "hook.go",
+        "state_file.go",
+        "status.go",
+    ],
+    visibility = [
+        "//runsc:__subpackages__",
+        "//test:__subpackages__",
+    ],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/log",
+        "//pkg/sentry/control",
+        "//pkg/sentry/sighandling",
+        "//pkg/sync",
+        "//runsc/boot",
+        "//runsc/cgroup",
+        "//runsc/sandbox",
+        "//runsc/specutils",
+        "@com_github_cenkalti_backoff//:go_default_library",
+        "@com_github_gofrs_flock//:go_default_library",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+    ],
+)
+
+# Tests for the container library. The target is marked "large", split into
+# 5 shards and tagged requires-kvm. The //runsc and //test/cmd/test_app
+# targets are made available to the tests as runtime data.
+go_test(
+    name = "container_test",
+    size = "large",
+    srcs = [
+        "console_test.go",
+        "container_norace_test.go",
+        "container_race_test.go",
+        "container_test.go",
+        "multi_container_test.go",
+        "shared_volume_test.go",
+    ],
+    data = [
+        "//runsc",
+        "//test/cmd/test_app",
+    ],
+    library = ":container",
+    shard_count = 5,
+    tags = [
+        "requires-kvm",
+    ],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/bits",
+        "//pkg/log",
+        "//pkg/sentry/control",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sync",
+        "//pkg/test/testutil",
+        "//pkg/unet",
+        "//pkg/urpc",
+        "//runsc/boot",
+        "//runsc/boot/platforms",
+        "//runsc/specutils",
+        "@com_github_cenkalti_backoff//:go_default_library",
+        "@com_github_kr_pty//:go_default_library",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
new file mode 100644
index 000000000..294dca5e7
--- /dev/null
+++ b/runsc/container/console_test.go
@@ -0,0 +1,480 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package container
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"os"
+	"path/filepath"
+	"syscall"
+	"testing"
+	"time"
+
+	"github.com/kr/pty"
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/test/testutil"
+	"gvisor.dev/gvisor/pkg/unet"
+	"gvisor.dev/gvisor/pkg/urpc"
+)
+
+// socketPath creates a path inside bundleDir and ensures that the returned
+// path is under 108 characters (the unix socket path length limit),
+// relativizing the path against the current working directory when that
+// makes it shorter.
+//
+// The kernel's sun_path buffer is 108 bytes including the terminating NUL,
+// so a path of exactly 108 bytes cannot be bound and is rejected here too.
+// bundleDir must be absolute; otherwise computing the relative path fails.
+func socketPath(bundleDir string) (string, error) {
+	path := filepath.Join(bundleDir, "socket")
+	cwd, err := os.Getwd()
+	if err != nil {
+		return "", fmt.Errorf("error getting cwd: %v", err)
+	}
+	relPath, err := filepath.Rel(cwd, path)
+	if err != nil {
+		return "", fmt.Errorf("error getting relative path for %q from cwd %q: %v", path, cwd, err)
+	}
+	// Use whichever spelling of the path is shorter.
+	if len(path) > len(relPath) {
+		path = relPath
+	}
+	const maxPathLen = 108
+	if len(path) >= maxPathLen {
+		return "", fmt.Errorf("could not get socket path under length limit %d: %s", maxPathLen, path)
+	}
+	return path, nil
+}
+
+// createConsoleSocket binds and listens on a socket at the given path, on
+// which a console fd is expected to arrive from the sandbox. A failure to
+// set up the socket ends the test through t.Fatalf. The returned function
+// closes the socket and removes its path; callers should defer it.
+func createConsoleSocket(t *testing.T, path string) (*unet.ServerSocket, func()) {
+	t.Helper()
+
+	srv, err := unet.BindAndListen(path, false)
+	if err != nil {
+		t.Fatalf("error binding and listening to socket %q: %v", path, err)
+	}
+
+	return srv, func() {
+		// Cleanup failures are only logged; nothing can be done about them.
+		if closeErr := srv.Close(); closeErr != nil {
+			t.Logf("error closing socket %q: %v", path, closeErr)
+		}
+		if removeErr := os.Remove(path); removeErr != nil {
+			t.Logf("error removing socket %q: %v", path, removeErr)
+		}
+	}
+}
+
+// receiveConsolePTY accepts a connection on the server socket and reads fds.
+// It fails unless exactly one FD is received, or if the FD is not a PTY. It
+// returns the PTY master file.
+//
+// The accepted connection is closed before returning, and any FD that was
+// received but is not handed to the caller is closed as well.
+func receiveConsolePTY(srv *unet.ServerSocket) (*os.File, error) {
+	sock, err := srv.Accept()
+	if err != nil {
+		return nil, fmt.Errorf("error accepting socket connection: %v", err)
+	}
+	// The connection is only needed to transfer the FD. Extracted FDs are
+	// independent descriptors and stay valid after the socket is closed.
+	defer sock.Close()
+
+	// Allow a single FD to be received; that is all we expect.
+	r := sock.Reader(true /* blocking */)
+	r.EnableFDs(1)
+
+	// The socket is closed right after sending the FD, so EOF is
+	// an allowed error.
+	b := [][]byte{{}}
+	if _, err := r.ReadVec(b); err != nil && err != io.EOF {
+		return nil, fmt.Errorf("error reading from socket connection: %v", err)
+	}
+
+	// We should have gotten a control message.
+	fds, err := r.ExtractFDs()
+	if err != nil {
+		return nil, fmt.Errorf("error extracting fds from socket connection: %v", err)
+	}
+	if len(fds) != 1 {
+		// Do not leak whatever was received.
+		for _, fd := range fds {
+			unix.Close(fd)
+		}
+		return nil, fmt.Errorf("got %d fds from socket, wanted 1", len(fds))
+	}
+
+	// Verify that the fd is a terminal.
+	if _, err := unix.IoctlGetTermios(fds[0], unix.TCGETS); err != nil {
+		unix.Close(fds[0])
+		return nil, fmt.Errorf("fd is not a terminal (ioctl TCGETS got %v)", err)
+	}
+
+	return os.NewFile(uintptr(fds[0]), "pty_master"), nil
+}
+
+// TestConsoleSocket checks, for every configuration returned by
+// configs(t, all...), that creating a container with a console socket makes
+// a pty FD arrive on that socket.
+func TestConsoleSocket(t *testing.T) {
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			spec := testutil.NewSpecWithArgs("true")
+			_, bundleDir, cleanupContainer, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanupContainer()
+
+			sock, err := socketPath(bundleDir)
+			if err != nil {
+				t.Fatalf("error getting socket path: %v", err)
+			}
+			srv, cleanupSocket := createConsoleSocket(t, sock)
+			defer cleanupSocket()
+
+			// Create the container and pass the socket name.
+			c, err := New(conf, Args{
+				ID:            testutil.RandomContainerID(),
+				Spec:          spec,
+				BundleDir:     bundleDir,
+				ConsoleSocket: sock,
+			})
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
+
+			// Make sure we get a console PTY. The file itself is not
+			// needed, so it is closed right away.
+			master, err := receiveConsolePTY(srv)
+			if err != nil {
+				t.Fatalf("error receiving console FD: %v", err)
+			}
+			master.Close()
+		})
+	}
+}
+
+// Test that job control signals work on a console created with "exec -ti".
+//
+// The test starts a container running sleep, execs an interactive bash
+// attached to a pty, starts another sleep from that bash, and then checks
+// that signals sent to the foreground process of the exec'd PID reach sleep
+// first and, once bash is in the foreground again, bash itself.
+func TestJobControlSignalExec(t *testing.T) {
+	spec := testutil.NewSpecWithArgs("/bin/sleep", "10000")
+	conf := testutil.TestConfig(t)
+
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+	if err != nil {
+		t.Fatalf("error setting up container: %v", err)
+	}
+	defer cleanup()
+
+	// Create and start the container.
+	args := Args{
+		ID:        testutil.RandomContainerID(),
+		Spec:      spec,
+		BundleDir: bundleDir,
+	}
+	c, err := New(conf, args)
+	if err != nil {
+		t.Fatalf("error creating container: %v", err)
+	}
+	defer c.Destroy()
+	if err := c.Start(conf); err != nil {
+		t.Fatalf("error starting container: %v", err)
+	}
+
+	// Create a pty master/slave. The slave will be passed to the exec
+	// process.
+	ptyMaster, ptySlave, err := pty.Open()
+	if err != nil {
+		t.Fatalf("error opening pty: %v", err)
+	}
+	defer ptyMaster.Close()
+	defer ptySlave.Close()
+
+	// Exec bash and attach a terminal. Note that occasionally /bin/sh
+	// may be a different shell or have a different configuration (such
+	// as disabling interactive mode and job control). Since we want to
+	// explicitly test interactive mode, use /bin/bash. See b/116981926.
+	execArgs := &control.ExecArgs{
+		Filename: "/bin/bash",
+		// Don't let bash execute from profile or rc files, otherwise
+		// our PID counts get messed up.
+		Argv: []string{"/bin/bash", "--noprofile", "--norc"},
+		// Pass the pty slave as FD 0, 1, and 2.
+		FilePayload: urpc.FilePayload{
+			Files: []*os.File{ptySlave, ptySlave, ptySlave},
+		},
+		StdioIsPty: true,
+	}
+
+	pid, err := c.Execute(execArgs)
+	if err != nil {
+		t.Fatalf("error executing: %v", err)
+	}
+	if pid != 2 {
+		t.Fatalf("exec got pid %d, wanted %d", pid, 2)
+	}
+
+	// Make sure all the processes are running.
+	expectedPL := []*control.Process{
+		// Root container process.
+		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+		// Bash from exec process.
+		{PID: 2, Cmd: "bash", Threads: []kernel.ThreadID{2}},
+	}
+	if err := waitForProcessList(c, expectedPL); err != nil {
+		t.Error(err)
+	}
+
+	// Execute sleep. If the command never reaches bash, everything below
+	// would only fail later with a misleading process-list mismatch, so a
+	// write error is fatal.
+	if _, err := ptyMaster.Write([]byte("sleep 100\n")); err != nil {
+		t.Fatalf("error writing to pty master: %v", err)
+	}
+
+	// Wait for it to start. Sleep's PPID is bash's PID.
+	expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}})
+	if err := waitForProcessList(c, expectedPL); err != nil {
+		t.Error(err)
+	}
+
+	// Send a SIGTERM to the foreground process for the exec PID. Note that
+	// although we pass in the PID of "bash", it should actually terminate
+	// "sleep", since that is the foreground process.
+	if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.SIGTERM, true /* fgProcess */); err != nil {
+		t.Fatalf("error signaling container: %v", err)
+	}
+
+	// Sleep process should be gone.
+	expectedPL = expectedPL[:len(expectedPL)-1]
+	if err := waitForProcessList(c, expectedPL); err != nil {
+		t.Error(err)
+	}
+
+	// Sleep is dead, but it may take more time for bash to notice and
+	// change the foreground process back to itself. We know it is done
+	// when bash writes "Terminated" to the pty.
+	if err := testutil.WaitUntilRead(ptyMaster, "Terminated", nil, 5*time.Second); err != nil {
+		t.Fatalf("bash did not take over pty: %v", err)
+	}
+
+	// Send a SIGKILL to the foreground process again. This time "bash"
+	// should be killed. We use SIGKILL instead of SIGTERM or SIGINT
+	// because bash ignores those.
+	if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.SIGKILL, true /* fgProcess */); err != nil {
+		t.Fatalf("error signaling container: %v", err)
+	}
+	expectedPL = expectedPL[:1]
+	if err := waitForProcessList(c, expectedPL); err != nil {
+		t.Error(err)
+	}
+
+	// Make sure the process indicates it was killed by a SIGKILL.
+	ws, err := c.WaitPID(pid)
+	if err != nil {
+		t.Errorf("waiting on container failed: %v", err)
+	}
+	if !ws.Signaled() {
+		t.Error("ws.Signaled() got false, want true")
+	}
+	if got, want := ws.Signal(), syscall.SIGKILL; got != want {
+		t.Errorf("ws.Signal() got %v, want %v", got, want)
+	}
+}
+
+// Test that job control signals work on a console created with "run -ti".
+func TestJobControlSignalRootContainer(t *testing.T) {
+	conf := testutil.TestConfig(t)
+	// Don't let bash execute from profile or rc files, otherwise our PID
+	// counts get messed up.
+	spec := testutil.NewSpecWithArgs("/bin/bash", "--noprofile", "--norc")
+	spec.Process.Terminal = true
+
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+	if err != nil {
+		t.Fatalf("error setting up container: %v", err)
+	}
+	defer cleanup()
+
+	sock, err := socketPath(bundleDir)
+	if err != nil {
+		t.Fatalf("error getting socket path: %v", err)
+	}
+	srv, cleanup := createConsoleSocket(t, sock)
+	defer cleanup()
+
+	// Create the container and pass the socket name.
+	args := Args{
+		ID:            testutil.RandomContainerID(),
+		Spec:          spec,
+		BundleDir:     bundleDir,
+		ConsoleSocket: sock,
+	}
+	c, err := New(conf, args)
+	if err != nil {
+		t.Fatalf("error creating container: %v", err)
+	}
+	defer c.Destroy()
+
+	// Get the PTY master.
+	ptyMaster, err := receiveConsolePTY(srv)
+	if err != nil {
+		t.Fatalf("error receiving console FD: %v", err)
+	}
+	defer ptyMaster.Close()
+
+	// Bash output as well as sandbox output will be written to the PTY
+	// file. Writes after a certain point will block unless we drain the
+	// PTY, so we must continually copy from it.
+	//
+	// We log the output to stderr for debuggability, and also to a buffer,
+	// since we wait on particular output from bash below. We use a custom
+	// blockingBuffer which is thread-safe and also blocks on Read calls,
+	// which makes this a suitable Reader for WaitUntilRead.
+	ptyBuf := newBlockingBuffer()
+	tee := io.TeeReader(ptyMaster, ptyBuf)
+	go io.Copy(os.Stderr, tee)
+
+	// Start the container.
+	if err := c.Start(conf); err != nil {
+		t.Fatalf("error starting container: %v", err)
+	}
+
+	// Start waiting for the container to exit in a goroutine. We do this
+	// very early, otherwise it might exit before we have a chance to call
+	// Wait.
+	var ws syscall.WaitStatus
+	var wg sync.WaitGroup
+	wg.Add(1)
+	go func() {
+		var err error
+		ws, err = c.Wait()
+		if err != nil {
+			t.Errorf("error waiting on container: %v", err)
+		}
+		wg.Done()
+	}()
+
+	// Wait for bash to start.
+	expectedPL := []*control.Process{
+		{PID: 1, Cmd: "bash", Threads: []kernel.ThreadID{1}},
+	}
+	if err := waitForProcessList(c, expectedPL); err != nil {
+		t.Fatalf("error waiting for processes: %v", err)
+	}
+
+	// Execute sleep via the terminal. Fail right away if the command cannot
+	// be written, instead of timing out on the process list below.
+	if _, err := ptyMaster.Write([]byte("sleep 100\n")); err != nil {
+		t.Fatalf("error writing to pty: %v", err)
+	}
+	// Wait for sleep to start.
+	expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{2}})
+	if err := waitForProcessList(c, expectedPL); err != nil {
+		t.Fatalf("error waiting for processes: %v", err)
+	}
+
+	// Reset the pty buffer, so there is less output for us to scan later.
+	ptyBuf.Reset()
+
+	// Send a SIGTERM to the foreground process. We pass PID=0, indicating
+	// that the root process should be killed. However, by setting
+	// fgProcess=true, the signal should actually be sent to sleep.
+	if err := c.Sandbox.SignalProcess(c.ID, 0 /* PID */, syscall.SIGTERM, true /* fgProcess */); err != nil {
+		t.Fatalf("error signaling container: %v", err)
+	}
+
+	// Sleep process should be gone.
+	expectedPL = expectedPL[:len(expectedPL)-1]
+	if err := waitForProcessList(c, expectedPL); err != nil {
+		t.Error(err)
+	}
+
+	// Sleep is dead, but it may take more time for bash to notice and
+	// change the foreground process back to itself. We know it is done
+	// when bash writes "Terminated" to the pty.
+	if err := testutil.WaitUntilRead(ptyBuf, "Terminated", nil, 5*time.Second); err != nil {
+		t.Fatalf("bash did not take over pty: %v", err)
+	}
+
+	// Send a SIGKILL to the foreground process again. This time "bash"
+	// should be killed. We use SIGKILL instead of SIGTERM or SIGINT
+	// because bash ignores those.
+	if err := c.Sandbox.SignalProcess(c.ID, 0 /* PID */, syscall.SIGKILL, true /* fgProcess */); err != nil {
+		t.Fatalf("error signaling container: %v", err)
+	}
+
+	// Wait for the sandbox to exit. It should exit with a SIGKILL status.
+	wg.Wait()
+	if !ws.Signaled() {
+		t.Error("ws.Signaled() got false, want true")
+	}
+	if got, want := ws.Signal(), syscall.SIGKILL; got != want {
+		t.Errorf("ws.Signal() got %v, want %v", got, want)
+	}
+}
+
+// blockingBuffer is a mutex-guarded byte buffer whose Read blocks while the
+// buffer is empty instead of returning io.EOF. It implements io.ReadWriter.
+type blockingBuffer struct {
+	// readCh carries a wake-up token: Write sends on it when it adds data to
+	// an empty buffer, and Read receives from it while waiting for data.
+	readCh chan struct{}
+
+	// mu protects buf.
+	mu  sync.Mutex
+	buf bytes.Buffer
+}
+
+// newBlockingBuffer returns an empty blockingBuffer that is ready for use.
+func newBlockingBuffer() *blockingBuffer {
+	// Capacity 1 lets Write leave a wake-up token without a waiting reader.
+	return &blockingBuffer{readCh: make(chan struct{}, 1)}
+}
+
+// Write implements io.Writer.Write. It appends p to the buffer and, if the
+// buffer was empty beforehand, sends on readCh so a blocked Read can proceed.
+func (bb *blockingBuffer) Write(p []byte) (int, error) {
+	bb.mu.Lock()
+	defer bb.mu.Unlock()
+	wasEmpty := bb.buf.Len() == 0
+	written, err := bb.buf.Write(p)
+	if wasEmpty && written > 0 {
+		bb.readCh <- struct{}{}
+	}
+	return written, err
+}
+
+// Read implements io.Reader.Read. It blocks while the buffer is empty.
+func (bb *blockingBuffer) Read(p []byte) (int, error) {
+	for {
+		bb.mu.Lock()
+		count, err := bb.buf.Read(p)
+		if count == 0 && err == io.EOF {
+			// Nothing buffered: drop the lock and wait for Write's signal.
+			bb.mu.Unlock()
+			<-bb.readCh
+			continue
+		}
+		if bb.buf.Len() == 0 {
+			// Buffer fully drained; discard any stale wake-up token.
+			select {
+			case <-bb.readCh:
+			default:
+			}
+		}
+		bb.mu.Unlock()
+		return count, err
+	}
+}
+
+// Reset discards all buffered data along with any pending wake-up token.
+func (bb *blockingBuffer) Reset() {
+	bb.mu.Lock()
+	defer bb.mu.Unlock()
+	bb.buf.Reset()
+	select {
+	case <-bb.readCh:
+		// A token was pending; it has now been consumed.
+	default:
+	}
+}
diff --git a/runsc/container/container.go b/runsc/container/container.go
new file mode 100644
index 000000000..8539f252d
--- /dev/null
+++ b/runsc/container/container.go
@@ -0,0 +1,1170 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package container creates and manipulates containers.
+package container
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"os/exec"
+	"regexp"
+	"strconv"
+	"strings"
+	"syscall"
+	"time"
+
+	"github.com/cenkalti/backoff"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/sighandling"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/cgroup"
+	"gvisor.dev/gvisor/runsc/sandbox"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+// validateID checks id against the pattern used by libcontainer: one or more
+// word characters or characters in the range '+' through '.' (that is "+,-.").
+func validateID(id string) error {
+	// See libcontainer/factory_linux.go.
+	if regexp.MustCompile(`^[\w+-\.]+$`).MatchString(id) {
+		return nil
+	}
+	return fmt.Errorf("invalid container id: %v", id)
+}
+
+// Container represents a containerized application. When running, the
+// container is associated with a single Sandbox.
+//
+// Container metadata can be saved and loaded to disk. Within a root directory,
+// we maintain subdirectories for each container named with the container id.
+// The container metadata is stored as JSON within the container directory
+// in a file named "meta.json". This metadata format is defined by us and is
+// not part of the OCI spec.
+//
+// Containers must write their metadata files after any change to their internal
+// states. The entire container directory is deleted when the container is
+// destroyed.
+//
+// When the container is stopped, all processes that belong to the container
+// must be stopped before Destroy() returns. containerd makes roughly the
+// following calls to stop a container:
+//   - First it attempts to kill the container process with
+//     'runsc kill SIGTERM'. After some time, it escalates to SIGKILL. In a
+//     separate thread, it's waiting on the container. As soon as the wait
+//     returns, it moves on to the next step:
+//   - It calls 'runsc kill --all SIGKILL' to stop every process that belongs to
+//     the container. 'kill --all SIGKILL' waits for all processes before
+//     returning.
+//   - Containerd waits for stdin, stdout and stderr to drain and be closed.
+//   - It calls 'runsc delete'. The runc implementation runs kill --all SIGKILL
+//     once more just to be sure, waits, and then proceeds with the teardown.
+//
+type Container struct {
+	// ID is the container ID.
+	ID string `json:"id"`
+
+	// Spec is the OCI runtime spec that configures this container.
+	Spec *specs.Spec `json:"spec"`
+
+	// BundleDir is the directory containing the container bundle.
+	BundleDir string `json:"bundleDir"`
+
+	// CreatedAt is the time the container was created.
+	CreatedAt time.Time `json:"createdAt"`
+
+	// Owner is the container owner.
+	Owner string `json:"owner"`
+
+	// ConsoleSocket is the path to a unix domain socket that will receive
+	// the console FD.
+	ConsoleSocket string `json:"consoleSocket"`
+
+	// Status is the current container Status.
+	Status Status `json:"status"`
+
+	// GoferPid is the PID of the gofer running alongside the sandbox. It is
+	// 0 when there is no gofer to wait for (e.g. it has been killed).
+	GoferPid int `json:"goferPid"`
+
+	// Sandbox is the sandbox this container is running in. It's set when the
+	// container is created and reset to nil when the container is stopped.
+	Sandbox *sandbox.Sandbox `json:"sandbox"`
+
+	// Saver handles loading from and saving to the state file safely across
+	// multiple processes.
+	Saver StateFile `json:"saver"`
+
+	//
+	// Fields below this line are not saved in the state file and will not
+	// be preserved across commands.
+	//
+
+	// goferIsChild is set if a gofer process is a child of the current process.
+	//
+	// This field isn't saved to json, because only the creator of a gofer
+	// process will have it as a child process.
+	goferIsChild bool
+}
+
+// loadSandbox returns every container under rootDir whose sandbox has the
+// given ID. Containers without sandbox information are skipped.
+func loadSandbox(rootDir, id string) ([]*Container, error) {
+	cids, err := List(rootDir)
+	if err != nil {
+		return nil, err
+	}
+
+	// Load the container metadata.
+	var containers []*Container
+	for _, cid := range cids {
+		container, err := Load(rootDir, cid)
+		if err != nil {
+			// Container file may not exist if it raced with creation/deletion or
+			// directory was left behind. Load provides a snapshot in time, so it's
+			// fine to skip it.
+			if os.IsNotExist(err) {
+				continue
+			}
+			return nil, fmt.Errorf("loading container %q: %v", cid, err)
+		}
+		if container.Sandbox != nil && container.Sandbox.ID == id {
+			containers = append(containers, container)
+		}
+	}
+	return containers, nil
+}
+
+// Load loads a container with the given id from a metadata file. partialID may
+// be an abbreviation of the full container id, in which case Load loads the
+// container to which it unambiguously refers. Returns os.ErrNotExist if no
+// container matches.
+func Load(rootDir, partialID string) (*Container, error) {
+	log.Debugf("Load container %q %q", rootDir, partialID)
+	if err := validateID(partialID); err != nil {
+		return nil, fmt.Errorf("validating id: %v", err)
+	}
+
+	id, err := findContainerID(rootDir, partialID)
+	if err != nil {
+		// Preserve error so that callers can distinguish 'not found' errors.
+		return nil, err
+	}
+
+	state := StateFile{
+		RootDir: rootDir,
+		ID:      id,
+	}
+	defer state.close()
+
+	c := &Container{}
+	if err := state.load(c); err != nil {
+		if os.IsNotExist(err) {
+			// Preserve error so that callers can distinguish 'not found' errors.
+			return nil, err
+		}
+		return nil, fmt.Errorf("reading container metadata file %q: %v", state.statePath(), err)
+	}
+
+	// If the status is "Running" or "Created", check that the sandbox
+	// process still exists, and set it to Stopped if it does not.
+	//
+	// This is inherently racy: the sandbox may exit right after the check.
+	if c.Status == Running || c.Status == Created {
+		// Check if the sandbox process is still running.
+		if !c.isSandboxRunning() {
+			// Sandbox no longer exists, so this container definitely does not exist.
+			c.changeStatus(Stopped)
+		} else if c.Status == Running {
+			// Container state should reflect the actual state of the application, so
+			// we don't consider the gofer process here. Signal 0 is only a probe.
+			if err := c.SignalContainer(syscall.Signal(0), false); err != nil {
+				c.changeStatus(Stopped)
+			}
+		}
+	}
+
+	return c, nil
+}
+
+// findContainerID resolves partialID to a full container id under rootDir.
+// It returns os.ErrNotExist when nothing matches and an error when ambiguous.
+func findContainerID(rootDir, partialID string) (string, error) {
+	// An exact match on an existing state file wins outright.
+	if _, err := os.Stat(buildStatePath(rootDir, partialID)); err == nil {
+		return partialID, nil
+	}
+
+	// Otherwise treat partialID as a prefix that must select exactly one id.
+	allIDs, err := List(rootDir)
+	if err != nil {
+		return "", err
+	}
+	var match string
+	for _, candidate := range allIDs {
+		if !strings.HasPrefix(candidate, partialID) {
+			continue
+		}
+		if match != "" {
+			return "", fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, match, candidate)
+		}
+		match = candidate
+	}
+	if match == "" {
+		return "", os.ErrNotExist
+	}
+	log.Debugf("abbreviated id %q resolves to full id %q", partialID, match)
+	return match, nil
+}
+
+// Args holds the parameters that New and Run use to create a container.
+type Args struct {
+	// ID is the container unique identifier.
+	ID string
+
+	// Spec is the OCI spec that describes the container.
+	Spec *specs.Spec
+
+	// BundleDir is the directory containing the container bundle.
+	BundleDir string
+
+	// ConsoleSocket is the path to a unix domain socket that will receive
+	// the console FD. It may be empty.
+	ConsoleSocket string
+
+	// PIDFile is the filename where the sandbox PID (see SandboxPid) will be
+	// written once creation completes. It may be empty.
+	PIDFile string
+
+	// UserLog is the filename to send user-visible logs to. It may be empty.
+	//
+	// It only applies for the init container.
+	UserLog string
+
+	// Attached indicates that the sandbox lifecycle is attached to the caller.
+	// If the caller exits, the sandbox should exit too.
+	//
+	// It only applies for the init container.
+	Attached bool
+}
+
+// New creates the container in a new Sandbox process when the spec describes
+// a root container, or inside the existing Sandbox that the spec names
+// otherwise. The caller must call Destroy() on the returned container.
+func New(conf *boot.Config, args Args) (*Container, error) {
+	log.Debugf("Create container %q in root dir: %s", args.ID, conf.RootDir)
+	if err := validateID(args.ID); err != nil {
+		return nil, err
+	}
+
+	if err := os.MkdirAll(conf.RootDir, 0711); err != nil {
+		return nil, fmt.Errorf("creating container root directory %q: %v", conf.RootDir, err)
+	}
+
+	c := &Container{
+		ID:            args.ID,
+		Spec:          args.Spec,
+		ConsoleSocket: args.ConsoleSocket,
+		BundleDir:     args.BundleDir,
+		Status:        Creating,
+		CreatedAt:     time.Now(),
+		Owner:         os.Getenv("USER"),
+		Saver: StateFile{
+			RootDir: conf.RootDir,
+			ID:      args.ID,
+		},
+	}
+	// The Cleanup object destroys a partially created container on any error
+	// return below. Errors from the cleanup itself are ignored.
+	cu := specutils.MakeCleanup(func() { _ = c.Destroy() })
+	defer cu.Clean()
+
+	// Lock the container metadata file to prevent concurrent creations of
+	// containers with the same id.
+	if err := c.Saver.lockForNew(); err != nil {
+		return nil, err
+	}
+	defer c.Saver.unlock()
+
+	// If the metadata annotations indicate that this container should be
+	// started in an existing sandbox, we must do so. The metadata will
+	// indicate the ID of the sandbox, which is the same as the ID of the
+	// init container in the sandbox.
+	if isRoot(args.Spec) {
+		log.Debugf("Creating new sandbox for container %q", args.ID)
+
+		// Create and join cgroup before processes are created to ensure they are
+		// part of the cgroup from the start (and all their children processes).
+		cg, err := cgroup.New(args.Spec)
+		if err != nil {
+			return nil, err
+		}
+		if cg != nil {
+			// If there is cgroup config, install it before creating sandbox process.
+			if err := cg.Install(args.Spec.Linux.Resources); err != nil {
+				return nil, fmt.Errorf("configuring cgroup: %v", err)
+			}
+		}
+		if err := runInCgroup(cg, func() error {
+			ioFiles, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir)
+			if err != nil {
+				return err
+			}
+
+			// Start a new sandbox for this container. Any errors after this point
+			// must destroy the container.
+			sandArgs := &sandbox.Args{
+				ID:            args.ID,
+				Spec:          args.Spec,
+				BundleDir:     args.BundleDir,
+				ConsoleSocket: args.ConsoleSocket,
+				UserLog:       args.UserLog,
+				IOFiles:       ioFiles,
+				MountsFile:    specFile,
+				Cgroup:        cg,
+				Attached:      args.Attached,
+			}
+			sand, err := sandbox.New(conf, sandArgs)
+			if err != nil {
+				return err
+			}
+			c.Sandbox = sand
+			return nil
+
+		}); err != nil {
+			return nil, err
+		}
+	} else {
+		// This is sort of confusing. For a sandbox with a root
+		// container and a child container in it, runsc sees:
+		// * A container struct whose sandbox ID is equal to the
+		//   container ID. This is the root container that is tied to
+		//   the creation of the sandbox.
+		// * A container struct whose sandbox ID is equal to the above
+		//   container/sandbox ID, but that has a different container
+		//   ID. This is the child container.
+		sbid, ok := specutils.SandboxID(args.Spec)
+		if !ok {
+			return nil, fmt.Errorf("no sandbox ID found when creating container")
+		}
+		log.Debugf("Creating new container %q in sandbox %q", c.ID, sbid)
+
+		// Find the sandbox associated with this ID.
+		sb, err := Load(conf.RootDir, sbid)
+		if err != nil {
+			return nil, err
+		}
+		c.Sandbox = sb.Sandbox
+		if err := c.Sandbox.CreateContainer(c.ID); err != nil {
+			return nil, err
+		}
+	}
+	c.changeStatus(Created)
+
+	// Save the metadata file.
+	if err := c.saveLocked(); err != nil {
+		return nil, err
+	}
+
+	// Write the PID file. Containerd considers the create complete after
+	// this file is created, so it must be the last thing we do.
+	if args.PIDFile != "" {
+		if err := ioutil.WriteFile(args.PIDFile, []byte(strconv.Itoa(c.SandboxPid())), 0644); err != nil {
+			return nil, fmt.Errorf("error writing PID file: %v", err)
+		}
+	}
+
+	cu.Release()
+	return c, nil
+}
+
+// Start runs the containerized process in the sandbox; status must be Created.
+func (c *Container) Start(conf *boot.Config) error {
+	log.Debugf("Start container %q", c.ID)
+
+	if err := c.Saver.lock(); err != nil {
+		return err
+	}
+	unlock := specutils.MakeCleanup(func() { c.Saver.unlock() })
+	defer unlock.Clean()
+
+	if err := c.requireStatus("start", Created); err != nil {
+		return err
+	}
+
+	// "If any prestart hook fails, the runtime MUST generate an error,
+	// stop and destroy the container" -OCI spec.
+	if c.Spec.Hooks != nil {
+		if err := executeHooks(c.Spec.Hooks.Prestart, c.State()); err != nil {
+			return err
+		}
+	}
+
+	if isRoot(c.Spec) {
+		if err := c.Sandbox.StartRoot(c.Spec, conf); err != nil {
+			return err
+		}
+	} else {
+		// Join the cgroup before starting the gofer process so that it (and all
+		// of its child processes) belongs to the cgroup from the start.
+		if err := runInCgroup(c.Sandbox.Cgroup, func() error {
+			// Create the gofer process.
+			ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir)
+			if err != nil {
+				return err
+			}
+			defer mountsFile.Close()
+
+			cleanMounts, err := specutils.ReadMounts(mountsFile)
+			if err != nil {
+				return fmt.Errorf("reading mounts file: %v", err)
+			}
+			c.Spec.Mounts = cleanMounts
+
+			return c.Sandbox.StartContainer(c.Spec, conf, c.ID, ioFiles)
+		}); err != nil {
+			return err
+		}
+	}
+
+	// "If any poststart hook fails, the runtime MUST log a warning, but
+	// the remaining hooks and lifecycle continue as if the hook had
+	// succeeded" -OCI spec.
+	if c.Spec.Hooks != nil {
+		executeHooksBestEffort(c.Spec.Hooks.Poststart, c.State())
+	}
+
+	c.changeStatus(Running)
+	if err := c.saveLocked(); err != nil {
+		return err
+	}
+
+	// Release lock before adjusting OOM score because the lock is acquired there.
+	unlock.Clean()
+
+	// Adjust the oom_score_adj for sandbox. This must be done after saveLocked().
+	if err := adjustSandboxOOMScoreAdj(c.Sandbox, c.Saver.RootDir, false); err != nil {
+		return err
+	}
+
+	// Set container's oom_score_adj to the gofer since it is dedicated to
+	// the container, in case the gofer uses up too much memory.
+	return c.adjustGoferOOMScoreAdj()
+}
+
+// Restore replaces the kernel and file system of a Created container with
+// the state saved in restoreFile, then marks the container as Running.
+func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile string) error {
+	log.Debugf("Restore container %q", c.ID)
+	if lockErr := c.Saver.lock(); lockErr != nil {
+		return lockErr
+	}
+	defer c.Saver.unlock()
+
+	if statusErr := c.requireStatus("restore", Created); statusErr != nil {
+		return statusErr
+	}
+
+	// "If any prestart hook fails, the runtime MUST generate an error,
+	// stop and destroy the container" -OCI spec.
+	if hooks := c.Spec.Hooks; hooks != nil {
+		if hookErr := executeHooks(hooks.Prestart, c.State()); hookErr != nil {
+			return hookErr
+		}
+	}
+
+	if restoreErr := c.Sandbox.Restore(c.ID, spec, conf, restoreFile); restoreErr != nil {
+		return restoreErr
+	}
+	c.changeStatus(Running)
+	return c.saveLocked()
+}
+
+// Run is a helper that creates the container, starts (or restores) it and,
+// when args.Attached is set, waits for it to exit.
+func Run(conf *boot.Config, args Args) (syscall.WaitStatus, error) {
+	log.Debugf("Run container %q in root dir: %s", args.ID, conf.RootDir)
+	c, err := New(conf, args)
+	if err != nil {
+		return 0, fmt.Errorf("creating container: %v", err)
+	}
+	// Destroy the container on every return path that does not Release the
+	// cleanup. Any errors returned by Destroy() itself are ignored.
+	cu := specutils.MakeCleanup(func() { c.Destroy() })
+	defer cu.Clean()
+
+	var startErr error
+	if conf.RestoreFile != "" {
+		log.Debugf("Restore: %v", conf.RestoreFile)
+		startErr = c.Restore(args.Spec, conf, conf.RestoreFile)
+	} else {
+		startErr = c.Start(conf)
+	}
+	if startErr != nil {
+		return 0, fmt.Errorf("starting container: %v", startErr)
+	}
+
+	if args.Attached {
+		return c.Wait()
+	}
+	cu.Release()
+	return 0, nil
+}
+
+// Execute runs args in the container and returns the new process's PID.
+func (c *Container) Execute(args *control.ExecArgs) (int32, error) {
+	log.Debugf("Execute in container %q, args: %+v", c.ID, args)
+	statusErr := c.requireStatus("execute in", Created, Running)
+	if statusErr != nil {
+		return 0, statusErr
+	}
+	args.ContainerID = c.ID
+	return c.Sandbox.Execute(args)
+}
+
+// Event returns the sandbox's events for a Created, Running or Paused container.
+func (c *Container) Event() (*boot.Event, error) {
+	log.Debugf("Getting events for container %q", c.ID)
+	if err := c.requireStatus("get events for", Created, Running, Paused); err != nil {
+		return nil, err
+	}
+	return c.Sandbox.Event(c.ID)
+}
+
+// SandboxPid reports the PID of the sandbox hosting the container, or -1
+// when the container is not in the Created, Running or Paused state.
+func (c *Container) SandboxPid() int {
+	if c.requireStatus("get PID", Created, Running, Paused) == nil {
+		return c.Sandbox.Pid
+	}
+	return -1
+}
+
+// Wait blocks until the container exits and returns its WaitStatus. Waiting
+// on an already-stopped container returns its exit status immediately.
+func (c *Container) Wait() (syscall.WaitStatus, error) {
+	log.Debugf("Wait on container %q", c.ID)
+	ws, err := c.Sandbox.Wait(c.ID)
+	return ws, err
+}
+
+// WaitRootPID waits on process 'pid', interpreted in the sandbox's (root)
+// PID namespace, and returns its WaitStatus.
+func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) {
+	log.Debugf("Wait on PID %d in sandbox %q", pid, c.Sandbox.ID)
+	if c.isSandboxRunning() {
+		return c.Sandbox.WaitPID(c.Sandbox.ID, pid)
+	}
+	return 0, fmt.Errorf("sandbox is not running")
+}
+
+// WaitPID waits on process 'pid', interpreted in this container's PID
+// namespace, and returns its WaitStatus.
+func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) {
+	log.Debugf("Wait on PID %d in container %q", pid, c.ID)
+	if c.isSandboxRunning() {
+		return c.Sandbox.WaitPID(c.ID, pid)
+	}
+	return 0, fmt.Errorf("sandbox is not running")
+}
+
+// SignalContainer delivers sig to the container. With all set and sig being
+// SIGKILL, it waits for every process to exit before returning.
+// SignalContainer returns an error if the container is already stopped.
+// TODO(b/113680494): Distinguish different error types.
+func (c *Container) SignalContainer(sig syscall.Signal, all bool) error {
+	log.Debugf("Signal container %q: %v", c.ID, sig)
+	// The Stopped state is accepted on purpose. With all=false the sandbox
+	// reports an error anyway; with all=true the signal can still reach
+	// processes that outlived the init process, which is what container
+	// cleanup relies on. Any other state besides Running is rejected
+	// here.
+	if statusErr := c.requireStatus("signal", Running, Stopped); statusErr != nil {
+		return statusErr
+	}
+	if c.isSandboxRunning() {
+		return c.Sandbox.SignalContainer(c.ID, sig, all)
+	}
+	return fmt.Errorf("sandbox is not running")
+}
+
+// SignalProcess sends sig to the process identified by pid in the container.
+func (c *Container) SignalProcess(sig syscall.Signal, pid int32) error {
+	log.Debugf("Signal process %d in container %q: %v", pid, c.ID, sig)
+	if statusErr := c.requireStatus("signal a process inside", Running); statusErr != nil {
+		return statusErr
+	}
+	if c.isSandboxRunning() {
+		return c.Sandbox.SignalProcess(c.ID, pid, sig, false)
+	}
+	return fmt.Errorf("sandbox is not running")
+}
+
+// ForwardSignals relays every signal received by the current process to
+// process pid in the container. The returned function stops the relaying.
+func (c *Container) ForwardSignals(pid int32, fgProcess bool) func() {
+	log.Debugf("Forwarding all signals to container %q PID %d fgProcess=%t", c.ID, pid, fgProcess)
+	relay := func(sig linux.Signal) {
+		log.Debugf("Forwarding signal %d to container %q PID %d fgProcess=%t", sig, c.ID, pid, fgProcess)
+		if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.Signal(sig), fgProcess); err != nil {
+			log.Warningf("error forwarding signal %d to container %q: %v", sig, c.ID, err)
+		}
+	}
+	stopRelay := sighandling.StartSignalForwarding(relay)
+	return func() {
+		log.Debugf("Done forwarding signals to container %q PID %d fgProcess=%t", c.ID, pid, fgProcess)
+		stopRelay()
+	}
+}
+
+// Checkpoint asks the sandbox to write the container's statefile to f.
+func (c *Container) Checkpoint(f *os.File) error {
+	log.Debugf("Checkpoint container %q", c.ID)
+	statusErr := c.requireStatus("checkpoint", Created, Running, Paused)
+	if statusErr == nil {
+		return c.Sandbox.Checkpoint(c.ID, f)
+	}
+	return statusErr
+}
+
+// Pause suspends the container and its kernel. It fails unless the
+// container's status is Created or Running.
+func (c *Container) Pause() error {
+	log.Debugf("Pausing container %q", c.ID)
+	if lockErr := c.Saver.lock(); lockErr != nil {
+		return lockErr
+	}
+	defer c.Saver.unlock()
+
+	if !(c.Status == Created || c.Status == Running) {
+		return fmt.Errorf("cannot pause container %q in state %v", c.ID, c.Status)
+	}
+
+	if pauseErr := c.Sandbox.Pause(c.ID); pauseErr != nil {
+		return fmt.Errorf("pausing container: %v", pauseErr)
+	}
+	c.changeStatus(Paused)
+	return c.saveLocked()
+}
+
+// Resume unpauses the container and its kernel. It fails unless the
+// container's status is Paused.
+func (c *Container) Resume() error {
+	log.Debugf("Resuming container %q", c.ID)
+	if lockErr := c.Saver.lock(); lockErr != nil {
+		return lockErr
+	}
+	defer c.Saver.unlock()
+
+	if status := c.Status; status != Paused {
+		return fmt.Errorf("cannot resume container %q in state %v", c.ID, status)
+	}
+	if resumeErr := c.Sandbox.Resume(c.ID); resumeErr != nil {
+		return fmt.Errorf("resuming container: %v", resumeErr)
+	}
+	c.changeStatus(Running)
+	return c.saveLocked()
+}
+
+// State returns the container's metadata as an OCI specs.State value.
+func (c *Container) State() specs.State {
+	var st specs.State
+	st.Version = specs.Version
+	st.ID = c.ID
+	st.Status = c.Status.String()
+	st.Pid = c.SandboxPid()
+	st.Bundle = c.BundleDir
+	return st
+}
+
+// Processes lists the processes, with their metadata, inside the container.
+func (c *Container) Processes() ([]*control.Process, error) {
+	statusErr := c.requireStatus("get processes of", Running, Paused)
+	if statusErr == nil {
+		return c.Sandbox.Processes(c.ID)
+	}
+	return nil, statusErr
+}
+
+// Destroy stops all processes and frees all resources associated with the
+// container. Every cleanup step is attempted; failures are returned joined.
+func (c *Container) Destroy() error {
+	log.Debugf("Destroy container %q", c.ID)
+
+	if err := c.Saver.lock(); err != nil {
+		return err
+	}
+	defer func() {
+		c.Saver.unlock()
+		c.Saver.close()
+	}()
+
+	// Stored for later use as stop() sets c.Sandbox to nil.
+	sb := c.Sandbox
+
+	// We must perform the following cleanup steps:
+	// * stop the container and gofer processes,
+	// * remove the container filesystem on the host, and
+	// * delete the container metadata directory.
+	//
+	// It's possible for one or more of these steps to fail, but we should
+	// do our best to perform all of the cleanups. Hence, we keep a slice
+	// of errors and return their concatenation.
+	var errs []string
+	if err := c.stop(); err != nil {
+		err = fmt.Errorf("stopping container: %v", err)
+		log.Warningf("%v", err)
+		errs = append(errs, err.Error())
+	}
+
+	if err := c.Saver.destroy(); err != nil {
+		err = fmt.Errorf("deleting container state files: %v", err)
+		log.Warningf("%v", err)
+		errs = append(errs, err.Error())
+	}
+
+	c.changeStatus(Stopped)
+
+	// Adjust oom_score_adj for the sandbox. This must be done after the container
+	// is stopped and the directory at c.Root is removed. Adjustment can be
+	// skipped if the root container is exiting, because it brings down the entire
+	// sandbox.
+	//
+	// Use 'sb' to tell whether it has been executed before because Destroy must
+	// be idempotent.
+	if sb != nil && !isRoot(c.Spec) {
+		if err := adjustSandboxOOMScoreAdj(sb, c.Saver.RootDir, true); err != nil {
+			errs = append(errs, err.Error())
+		}
+	}
+
+	// "If any poststop hook fails, the runtime MUST log a warning, but the
+	// remaining hooks and lifecycle continue as if the hook had
+	// succeeded" - OCI spec.
+	//
+	// Based on the OCI, "The post-stop hooks MUST be called after the container
+	// is deleted but before the delete operation returns"
+	// Run it here to:
+	// 1) Conform to the OCI.
+	// 2) Make sure it only runs once, because the root has been deleted, the
+	// container can't be loaded again.
+	if c.Spec.Hooks != nil {
+		executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State())
+	}
+
+	if len(errs) == 0 {
+		return nil
+	}
+	// errors.New, not fmt.Errorf: the joined text must not act as a format.
+	return errors.New(strings.Join(errs, "\n"))
+}
+
+// saveLocked persists the container metadata through c.Saver.
+// Precondition: the state file must be locked (c.Saver.lock or lockForNew).
+func (c *Container) saveLocked() error {
+	log.Debugf("Save container %q", c.ID)
+	saveErr := c.Saver.saveLocked(c)
+	if saveErr == nil {
+		return nil
+	}
+	return fmt.Errorf("saving container metadata: %v", saveErr)
+}
+
+// stop tears down the container (or, for the root container, the sandbox)
+// and then waits for it and its gofer to exit. An error is returned if
+// either is still running once waitForStopped gives up.
+func (c *Container) stop() error {
+	// cg stays nil unless the sandbox's cgroup must be uninstalled below.
+	var cg *cgroup.Cgroup
+
+	if sb := c.Sandbox; sb != nil {
+		log.Debugf("Destroying container %q", c.ID)
+		if err := sb.DestroyContainer(c.ID); err != nil {
+			return fmt.Errorf("destroying container %q: %v", c.ID, err)
+		}
+		// Only uninstall cgroup for sandbox stop.
+		if sb.IsRootContainer(c.ID) {
+			cg = sb.Cgroup
+		}
+		// Only set sandbox to nil after it has been told to destroy the container.
+		c.Sandbox = nil
+	}
+
+	// Try killing gofer if it does not exit with container.
+	if pid := c.GoferPid; pid != 0 {
+		log.Debugf("Killing gofer for container %q, PID: %d", c.ID, pid)
+		if err := syscall.Kill(pid, syscall.SIGKILL); err != nil {
+			// The gofer may already be stopped, log the error.
+			log.Warningf("Error sending signal %d to gofer %d: %v", syscall.SIGKILL, pid, err)
+		}
+	}
+
+	if err := c.waitForStopped(); err != nil {
+		return err
+	}
+
+	// The gofer runs inside the cgroup, so Cgroup.Uninstall may only be
+	// called once the gofer has stopped.
+	if cg == nil {
+		return nil
+	}
+	return cg.Uninstall()
+}
+
+// waitForStopped polls every 100ms, for at most 5 seconds, until neither the
+// container nor its gofer appear to be running. Once the gofer is seen to be
+// gone, c.GoferPid is reset to 0.
+func (c *Container) waitForStopped() error {
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	stopped := func() error {
+		// Signal 0 only probes for existence; success means it is still alive.
+		if c.isSandboxRunning() && c.SignalContainer(syscall.Signal(0), false) == nil {
+			return fmt.Errorf("container is still running")
+		}
+		if c.GoferPid == 0 {
+			return nil
+		}
+		if !c.goferIsChild {
+			if err := syscall.Kill(c.GoferPid, 0); err == nil {
+				return fmt.Errorf("gofer is still running")
+			}
+		} else {
+			// The gofer process is a child of the current process,
+			// so we can wait it and collect its zombie.
+			wpid, err := syscall.Wait4(int(c.GoferPid), nil, syscall.WNOHANG, nil)
+			if err != nil {
+				return fmt.Errorf("error waiting the gofer process: %v", err)
+			}
+			if wpid == 0 {
+				return fmt.Errorf("gofer is still running")
+			}
+		}
+		c.GoferPid = 0
+		return nil
+	}
+
+	policy := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
+	return backoff.Retry(stopped, policy)
+}
+
+// createGoferProcess starts the gofer process for the container described by
+// spec and bundleDir.
+//
+// Files donated to the gofer (log files, spec file, the write end of the
+// mounts pipe and one socket end per 9P mount) are passed via ExtraFiles and
+// closed in this process when the function returns.
+//
+// On success it returns the sandbox-side socket ends (one for the root mount
+// plus one per 9P mount in spec.Mounts) and the read end of the mounts pipe;
+// the caller owns them. On failure every file created for the sandbox side is
+// closed before returning, so nothing leaks. It also records the gofer PID in
+// c.GoferPid and marks it as a child of the current process.
+func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]*os.File, *os.File, error) {
+	// Start with the general config flags.
+	args := conf.ToFlags()
+
+	var goferEnds []*os.File
+
+	// nextFD is the next available file descriptor for the gofer process.
+	// It starts at 3 because 0-2 are used by stdin/stdout/stderr.
+	nextFD := 3
+
+	if conf.LogFilename != "" {
+		logFile, err := os.OpenFile(conf.LogFilename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
+		if err != nil {
+			return nil, nil, fmt.Errorf("opening log file %q: %v", conf.LogFilename, err)
+		}
+		defer logFile.Close()
+		goferEnds = append(goferEnds, logFile)
+		args = append(args, "--log-fd="+strconv.Itoa(nextFD))
+		nextFD++
+	}
+
+	if conf.DebugLog != "" {
+		test := ""
+		if len(conf.TestOnlyTestNameEnv) != 0 {
+			// Fetch test name if one is provided and the test only flag was set.
+			if t, ok := specutils.EnvVar(spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
+				test = t
+			}
+		}
+		debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "gofer", test)
+		if err != nil {
+			return nil, nil, fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err)
+		}
+		defer debugLogFile.Close()
+		goferEnds = append(goferEnds, debugLogFile)
+		args = append(args, "--debug-log-fd="+strconv.Itoa(nextFD))
+		nextFD++
+	}
+
+	args = append(args, "gofer", "--bundle", bundleDir)
+	if conf.Overlay {
+		args = append(args, "--panic-on-write=true")
+	}
+
+	// Open the spec file to donate to the sandbox.
+	specFile, err := specutils.OpenSpec(bundleDir)
+	if err != nil {
+		return nil, nil, fmt.Errorf("opening spec file: %v", err)
+	}
+	defer specFile.Close()
+	goferEnds = append(goferEnds, specFile)
+	args = append(args, "--spec-fd="+strconv.Itoa(nextFD))
+	nextFD++
+
+	// Create pipe that allows gofer to send mount list to sandbox after all paths
+	// have been resolved.
+	mountsSand, mountsGofer, err := os.Pipe()
+	if err != nil {
+		return nil, nil, err
+	}
+	defer mountsGofer.Close()
+	goferEnds = append(goferEnds, mountsGofer)
+	args = append(args, fmt.Sprintf("--mounts-fd=%d", nextFD))
+	nextFD++
+
+	// From here on this function owns files that are only handed over to the
+	// caller on success. Close them on every error path, since the caller
+	// receives nil and has no way to release them.
+	var sandEnds []*os.File
+	started := false
+	defer func() {
+		if started {
+			return
+		}
+		mountsSand.Close()
+		for _, f := range sandEnds {
+			f.Close()
+		}
+	}()
+
+	// Add root mount and then add any other additional mounts.
+	mountCount := 1
+	for _, m := range spec.Mounts {
+		if specutils.Is9PMount(m) {
+			mountCount++
+		}
+	}
+
+	sandEnds = make([]*os.File, 0, mountCount)
+	for i := 0; i < mountCount; i++ {
+		fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
+		if err != nil {
+			return nil, nil, err
+		}
+		sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox IO FD"))
+
+		// The gofer end must stay open until the gofer has been started, so
+		// it is closed by a deferred call rather than inside the loop.
+		goferEnd := os.NewFile(uintptr(fds[1]), "gofer IO FD")
+		defer goferEnd.Close()
+		goferEnds = append(goferEnds, goferEnd)
+
+		args = append(args, fmt.Sprintf("--io-fds=%d", nextFD))
+		nextFD++
+	}
+
+	binPath := specutils.ExePath
+	cmd := exec.Command(binPath, args...)
+	cmd.ExtraFiles = goferEnds
+	cmd.Args[0] = "runsc-gofer"
+
+	// Enter new namespaces to isolate from the rest of the system. Don't unshare
+	// cgroup because gofer is added to a cgroup in the caller's namespace.
+	nss := []specs.LinuxNamespace{
+		{Type: specs.IPCNamespace},
+		{Type: specs.MountNamespace},
+		{Type: specs.NetworkNamespace},
+		{Type: specs.PIDNamespace},
+		{Type: specs.UTSNamespace},
+	}
+
+	// Setup any uid/gid mappings, and create or join the configured user
+	// namespace so the gofer's view of the filesystem aligns with the
+	// users in the sandbox.
+	userNS := specutils.FilterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec)
+	nss = append(nss, userNS...)
+	specutils.SetUIDGIDMappings(cmd, spec)
+	if len(userNS) != 0 {
+		// We need to set UID and GID to have capabilities in a new user namespace.
+		cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0}
+	}
+
+	// Start the gofer in the given namespace.
+	log.Debugf("Starting gofer: %s %v", binPath, args)
+	if err := specutils.StartInNS(cmd, nss); err != nil {
+		return nil, nil, fmt.Errorf("Gofer: %v", err)
+	}
+	log.Infof("Gofer started, PID: %d", cmd.Process.Pid)
+	c.GoferPid = cmd.Process.Pid
+	c.goferIsChild = true
+
+	// Ownership of sandEnds and mountsSand passes to the caller.
+	started = true
+	return sandEnds, mountsSand, nil
+}
+
+// changeStatus moves the container to status s, panicking when the
+// transition from the current status is not allowed or when a status that
+// needs a sandbox is requested while c.Sandbox is nil.
+func (c *Container) changeStatus(s Status) {
+	badTransition := func() {
+		panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
+	}
+	requireSandbox := func() {
+		if c.Sandbox == nil {
+			panic("sandbox cannot be nil")
+		}
+	}
+
+	switch s {
+	case Creating:
+		// Initial state, never transitions to it.
+		badTransition()
+
+	case Created:
+		if c.Status != Creating {
+			badTransition()
+		}
+		requireSandbox()
+
+	case Paused:
+		if c.Status != Running {
+			badTransition()
+		}
+		requireSandbox()
+
+	case Running:
+		if !(c.Status == Created || c.Status == Paused) {
+			badTransition()
+		}
+		requireSandbox()
+
+	case Stopped:
+		switch c.Status {
+		case Creating, Created, Running, Stopped:
+			// Allowed source states.
+		default:
+			badTransition()
+		}
+
+	default:
+		panic(fmt.Sprintf("invalid new state: %v", s))
+	}
+	c.Status = s
+}
+
+// isSandboxRunning reports whether the container still references a sandbox
+// and that sandbox reports itself as running.
+func (c *Container) isSandboxRunning() bool {
+	if c.Sandbox == nil {
+		return false
+	}
+	return c.Sandbox.IsRunning()
+}
+
+// requireStatus returns nil when the container's current status is one of
+// statuses, and otherwise an error naming the action that was refused.
+func (c *Container) requireStatus(action string, statuses ...Status) error {
+	for i := range statuses {
+		if statuses[i] == c.Status {
+			return nil
+		}
+	}
+	return fmt.Errorf("cannot %s container %q in state %s", action, c.ID, c.Status)
+}
+
+// isRoot reports whether spec describes a root container, i.e. any container
+// type other than specutils.ContainerTypeContainer.
+func isRoot(spec *specs.Spec) bool {
+	containerType := specutils.SpecContainerType(spec)
+	return containerType != specutils.ContainerTypeContainer
+}
+
+// runInCgroup executes fn inside the specified cgroup. If cg is nil, execute
+// it in the current context.
+//
+// If joining the cgroup fails, that error is returned and fn is not called.
+func runInCgroup(cg *cgroup.Cgroup, fn func() error) error {
+	if cg == nil {
+		return fn()
+	}
+	restore, err := cg.Join()
+	// restore is deferred before err is inspected, so it also runs when Join
+	// fails.
+	// NOTE(review): this relies on Join returning a non-nil func on its error
+	// path (presumably to undo a partial join) — confirm against cgroup.Join.
+	defer restore()
+	if err != nil {
+		return err
+	}
+	return fn()
+}
+
+// adjustGoferOOMScoreAdj sets the oom_score_adj for the container's gofer.
+// It is a no-op when no gofer PID is recorded or when the spec does not
+// specify an OOMScoreAdj.
+func (c *Container) adjustGoferOOMScoreAdj() error {
+	if c.GoferPid == 0 {
+		return nil
+	}
+	adj := c.Spec.Process.OOMScoreAdj
+	if adj == nil {
+		return nil
+	}
+	return setOOMScoreAdj(c.GoferPid, *adj)
+}
+
+// adjustSandboxOOMScoreAdj sets the oom_score_adj for the sandbox.
+// oom_score_adj is set to the lowest oom_score_adj among the containers
+// running in the sandbox. When destroy is true and none of the remaining
+// containers specifies a value, the sandbox falls back to the oom_score_adj
+// of its parent process.
+//
+// TODO(gvisor.dev/issue/238): This call could race with other containers being
+// created at the same time and end up setting the wrong oom_score_adj to the
+// sandbox. Use rpc client to synchronize.
+func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, rootDir string, destroy bool) error {
+	conts, err := loadSandbox(rootDir, s.ID)
+	if err != nil {
+		return fmt.Errorf("loading sandbox containers: %v", err)
+	}
+
+	// Do nothing if the sandbox has been terminated.
+	if len(conts) == 0 {
+		return nil
+	}
+
+	// Find the lowest score across all containers.
+	var (
+		lowest int
+		found  bool
+	)
+	single := len(conts) == 1 && specutils.SpecContainerType(conts[0].Spec) == specutils.ContainerTypeUnspecified
+	if single {
+		// This is a single-container sandbox. Set the oom_score_adj to
+		// the value specified in the OCI bundle.
+		if adj := conts[0].Spec.Process.OOMScoreAdj; adj != nil {
+			lowest, found = *adj, true
+		}
+	} else {
+		for _, cont := range conts {
+			// Special multi-container support for CRI. Ignore the root
+			// container when calculating oom_score_adj for the sandbox because
+			// it is the infrastructure (pause) container and always has a very
+			// low oom_score_adj.
+			//
+			// We will use OOMScoreAdj in the single-container case where the
+			// containerd container-type annotation is not present.
+			if specutils.SpecContainerType(cont.Spec) == specutils.ContainerTypeSandbox {
+				continue
+			}
+
+			adj := cont.Spec.Process.OOMScoreAdj
+			if adj == nil {
+				continue
+			}
+			if !found || *adj < lowest {
+				lowest, found = *adj, true
+			}
+		}
+	}
+
+	if !found {
+		// Only set oom_score_adj if one of the containers has oom_score_adj
+		// set in the OCI bundle. If not, we need to inherit the parent
+		// process's oom_score_adj.
+		// See: https://github.com/opencontainers/runtime-spec/blob/master/config.md#linux-process
+		if !destroy {
+			return nil
+		}
+
+		// If the container is destroyed and remaining containers have no
+		// oomScoreAdj specified then we must revert to the oom_score_adj of
+		// the parent process.
+		ppid, err := specutils.GetParentPid(s.Pid)
+		if err != nil {
+			return fmt.Errorf("getting parent pid of sandbox pid %d: %v", s.Pid, err)
+		}
+		pScore, err := specutils.GetOOMScoreAdj(ppid)
+		if err != nil {
+			return fmt.Errorf("getting oom_score_adj of parent %d: %v", ppid, err)
+		}
+		lowest = pScore
+	}
+
+	// Set the lowest of all containers oom_score_adj to the sandbox.
+	return setOOMScoreAdj(s.Pid, lowest)
+}
+
+// setOOMScoreAdj sets oom_score_adj to the given value for the given PID by
+// writing to /proc/<pid>/oom_score_adj.
+// /proc must be available and mounted read-write. scoreAdj should be between
+// -1000 and 1000. It's a noop if the process has already exited.
+//
+// Returns nil on success or when the process is gone (the proc file does not
+// exist, or the write fails with ESRCH); any other error is returned.
+func setOOMScoreAdj(pid int, scoreAdj int) error {
+	f, err := os.OpenFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid), os.O_WRONLY, 0644)
+	if err != nil {
+		// Ignore NotExist errors because it can race with process exit.
+		if os.IsNotExist(err) {
+			log.Warningf("Process (%d) not found setting oom_score_adj", pid)
+			return nil
+		}
+		return err
+	}
+	defer f.Close()
+	if _, err := f.WriteString(strconv.Itoa(scoreAdj)); err != nil {
+		// The process can exit between the open and the write.
+		if errors.Is(err, syscall.ESRCH) {
+			log.Warningf("Process (%d) exited while setting oom_score_adj", pid)
+			return nil
+		}
+		// Use %d: %q on an int prints a quoted character literal, not the
+		// numeric score.
+		return fmt.Errorf("setting oom_score_adj to %d: %v", scoreAdj, err)
+	}
+	return nil
+}
diff --git a/runsc/container/container_norace_test.go b/runsc/container/container_norace_test.go
new file mode 100644
index 000000000..838c1e20a
--- /dev/null
+++ b/runsc/container/container_norace_test.go
@@ -0,0 +1,20 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !race
+
+package container
+
+// platformOptions lists the platform config options used by the tests in
+// non-race builds: both kvm and ptrace are allowed.
+var platformOptions = []configOption{ptrace, kvm}
diff --git a/runsc/container/container_race_test.go b/runsc/container/container_race_test.go
new file mode 100644
index 000000000..9fb4c4fc0
--- /dev/null
+++ b/runsc/container/container_race_test.go
@@ -0,0 +1,20 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build race
+
+package container
+
+// platformOptions lists the platform config options used by the tests in
+// race builds: only ptrace is enabled.
+var platformOptions = []configOption{ptrace}
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
new file mode 100644
index 000000000..acf988aa0
--- /dev/null
+++ b/runsc/container/container_test.go
@@ -0,0 +1,2282 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package container
+
+import (
+	"bytes"
+	"flag"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"os"
+	"path"
+	"path/filepath"
+	"reflect"
+	"strconv"
+	"strings"
+	"syscall"
+	"testing"
+	"time"
+
+	"github.com/cenkalti/backoff"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/bits"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/test/testutil"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/boot/platforms"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+// waitForProcessList polls the container until its process list matches
+// want. A failure to query the container aborts the polling immediately.
+func waitForProcessList(cont *Container, want []*control.Process) error {
+	check := func() error {
+		got, err := cont.Processes()
+		if err != nil {
+			wrapped := fmt.Errorf("error getting process data from container: %v", err)
+			return &backoff.PermanentError{Err: wrapped}
+		}
+		equal, cmpErr := procListsEqual(got, want)
+		if equal {
+			return nil
+		}
+		return fmt.Errorf("container got process list: %s, want: %s: error: %v",
+			procListToString(got), procListToString(want), cmpErr)
+	}
+	// Gives plenty of time as tests can run slow under --race.
+	return testutil.Poll(check, 30*time.Second)
+}
+
+// waitForProcessCount polls the container until it reports exactly want
+// processes. A failure to query the container aborts the polling immediately.
+func waitForProcessCount(cont *Container, want int) error {
+	check := func() error {
+		procs, err := cont.Processes()
+		if err != nil {
+			wrapped := fmt.Errorf("error getting process data from container: %v", err)
+			return &backoff.PermanentError{Err: wrapped}
+		}
+		got := len(procs)
+		if got == want {
+			return nil
+		}
+		log.Infof("Waiting for process count to reach %d. Current: %d", want, got)
+		return fmt.Errorf("wrong process count, got: %d, want: %d", got, want)
+	}
+	// Gives plenty of time as tests can run slow under --race.
+	return testutil.Poll(check, 30*time.Second)
+}
+
+func blockUntilWaitable(pid int) error {
+	_, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) {
+		var err error
+		_, _, err1 := syscall.Syscall6(syscall.SYS_WAITID, 1, uintptr(pid), 0, syscall.WEXITED|syscall.WNOWAIT, 0, 0)
+		if err1 != 0 {
+			err = err1
+		}
+		return 0, 0, err
+	})
+	return err
+}
+
+// procListsEqual reports whether two Process lists are equal for all
+// implemented fields, comparing entries pairwise through their JSON form.
+//
+// Note that the timing-dependent and TTY fields of the entries in got are
+// cleared in place before the comparison.
+func procListsEqual(got, want []*control.Process) (bool, error) {
+	if len(got) != len(want) {
+		return false, nil
+	}
+	for i, g := range got {
+		// Zero out timing dependent fields.
+		g.Time = ""
+		g.STime = ""
+		g.C = 0
+		// Ignore TTY field too, since it's not relevant in the cases
+		// where we use this method. Tests that care about the TTY
+		// field should check for it themselves.
+		g.TTY = ""
+
+		gotJSON, err := control.ProcessListToJSON([]*control.Process{g})
+		if err != nil {
+			return false, err
+		}
+		wantJSON, err := control.ProcessListToJSON([]*control.Process{want[i]})
+		if err != nil {
+			return false, err
+		}
+		if gotJSON != wantJSON {
+			return false, nil
+		}
+	}
+	return true, nil
+}
+
+func procListToString(pl []*control.Process) string {
+	strs := make([]string, 0, len(pl))
+	for _, p := range pl {
+		strs = append(strs, fmt.Sprintf("%+v", p))
+	}
+	return fmt.Sprintf("[%s]", strings.Join(strs, ","))
+}
+
+// createWriteableOutputFile creates an output file that can be read and
+// written to in the sandbox.
+//
+// The file must not exist yet (it is opened with O_EXCL). On success the
+// caller owns the returned file and is responsible for closing it. On
+// failure no file descriptor is left open.
+func createWriteableOutputFile(path string) (*os.File, error) {
+	outputFile, err := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666)
+	if err != nil {
+		return nil, fmt.Errorf("error creating file: %q, %v", path, err)
+	}
+
+	// Chmod to allow writing after umask.
+	if err := outputFile.Chmod(0666); err != nil {
+		// The file is not handed to the caller, so close it here to avoid
+		// leaking the descriptor.
+		outputFile.Close()
+		return nil, fmt.Errorf("error chmoding file: %q, %v", path, err)
+	}
+	return outputFile, nil
+}
+
+// waitForFileNotEmpty polls f for up to 30 seconds until its size is
+// non-zero.
+func waitForFileNotEmpty(f *os.File) error {
+	nonEmpty := func() error {
+		info, err := f.Stat()
+		switch {
+		case err != nil:
+			return err
+		case info.Size() == 0:
+			return fmt.Errorf("file %q is empty", f.Name())
+		}
+		return nil
+	}
+
+	return testutil.Poll(nonEmpty, 30*time.Second)
+}
+
+// waitForFileExist polls for up to 30 seconds until os.Stat on path no longer
+// reports a not-exist error. Any other Stat error is treated as success.
+func waitForFileExist(path string) error {
+	exists := func() error {
+		_, err := os.Stat(path)
+		if os.IsNotExist(err) {
+			return err
+		}
+		return nil
+	}
+
+	return testutil.Poll(exists, 30*time.Second)
+}
+
+// readOutputNum reads a file at given filepath and returns the int at the
+// requested position.
+//
+// The content is split on newlines and position indexes into the resulting
+// lines. A position of -1 selects the last number, assuming the content ends
+// with a trailing newline. An error is returned when the file cannot be read,
+// when position does not refer to an existing line, or when that line is not
+// an integer.
+func readOutputNum(file string, position int) (int, error) {
+	f, err := os.Open(file)
+	if err != nil {
+		return 0, fmt.Errorf("error opening file: %q, %v", file, err)
+	}
+	// Release the descriptor on every return path.
+	defer f.Close()
+
+	// Ensure that there is content in output file.
+	if err := waitForFileNotEmpty(f); err != nil {
+		return 0, fmt.Errorf("error waiting for output file: %v", err)
+	}
+
+	b, err := ioutil.ReadAll(f)
+	if err != nil {
+		return 0, fmt.Errorf("error reading file: %v", err)
+	}
+	if len(b) == 0 {
+		return 0, fmt.Errorf("error no content was read")
+	}
+
+	// Strip leading null bytes caused by file offset not being 0 upon restore.
+	b = bytes.Trim(b, "\x00")
+	nums := strings.Split(string(b), "\n")
+
+	if position == -1 {
+		// Expectation of newline at the end of last position.
+		position = len(nums) - 2
+	}
+	// Validate after resolving -1 so that content without a trailing newline
+	// (or any other negative position) yields an error instead of a panic.
+	if position < 0 || position >= len(nums) {
+		return 0, fmt.Errorf("position %v is not within the length of content %v", position, nums)
+	}
+	num, err := strconv.Atoi(nums[position])
+	if err != nil {
+		return 0, fmt.Errorf("error getting number from file: %v", err)
+	}
+	return num, nil
+}
+
+// run sets up a container for spec, runs it attached until it exits and
+// returns an error unless the application exited with status 0.
+func run(spec *specs.Spec, conf *boot.Config) error {
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+	if err != nil {
+		return fmt.Errorf("error setting up container: %v", err)
+	}
+	defer cleanup()
+
+	// Create, start and wait for the container.
+	ws, err := Run(conf, Args{
+		ID:        testutil.RandomContainerID(),
+		Spec:      spec,
+		BundleDir: bundleDir,
+		Attached:  true,
+	})
+	if err != nil {
+		return fmt.Errorf("running container: %v", err)
+	}
+	if ws.Exited() && ws.ExitStatus() == 0 {
+		return nil
+	}
+	return fmt.Errorf("container failed, waitStatus: %v", ws)
+}
+
+// configOption selects an additional test configuration that configs()
+// generates on top of the default one.
+type configOption int
+
+const (
+	// overlay enables Config.Overlay.
+	overlay configOption = iota
+	// ptrace selects the ptrace platform.
+	ptrace
+	// kvm selects the KVM platform.
+	kvm
+	// nonExclusiveFS sets Config.FileAccess to boot.FileAccessShared.
+	nonExclusiveFS
+)
+
+var (
+	// noOverlay is platformOptions plus nonExclusiveFS.
+	noOverlay = append(platformOptions, nonExclusiveFS)
+	// all is noOverlay plus overlay.
+	all       = append(noOverlay, overlay)
+)
+
+// configs generates different configurations to run tests.
+func configs(t *testing.T, opts ...configOption) map[string]*boot.Config {
+	// Always load the default config.
+	cs := make(map[string]*boot.Config)
+	cs["default"] = testutil.TestConfig(t)
+
+	for _, o := range opts {
+		switch o {
+		case overlay:
+			c := testutil.TestConfig(t)
+			c.Overlay = true
+			cs["overlay"] = c
+		case ptrace:
+			c := testutil.TestConfig(t)
+			c.Platform = platforms.Ptrace
+			cs["ptrace"] = c
+		case kvm:
+			c := testutil.TestConfig(t)
+			c.Platform = platforms.KVM
+			cs["kvm"] = c
+		case nonExclusiveFS:
+			c := testutil.TestConfig(t)
+			c.FileAccess = boot.FileAccessShared
+			cs["non-exclusive"] = c
+		default:
+			panic(fmt.Sprintf("unknown config option %v", o))
+		}
+	}
+	return cs
+}
+
+// configsWithVFS2 returns the configurations produced by configs() together
+// with a second copy of each that has VFS2 enabled, stored under the original
+// key suffixed with "VFS2".
+func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*boot.Config {
+	merged := configs(t, opts...)
+
+	// Generate a second, independent set so the VFS1 configs stay untouched.
+	for name, conf := range configs(t, opts...) {
+		conf.VFS2 = true
+		merged[name+"VFS2"] = conf
+	}
+
+	return merged
+}
+
+// TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle.
+// It verifies after each step that the container can be loaded from disk, and
+// has the correct status.
+func TestLifecycle(t *testing.T) {
+	// Start the child reaper.
+	childReaper := &testutil.Reaper{}
+	childReaper.Start()
+	defer childReaper.Stop()
+
+	for name, conf := range configsWithVFS2(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			// The container will just sleep for a long time.  We will kill it before
+			// it finishes sleeping.
+			spec := testutil.NewSpecWithArgs("sleep", "100")
+
+			rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			// expectedPL lists the expected process state of the container.
+			expectedPL := []*control.Process{
+				{
+					UID:     0,
+					PID:     1,
+					PPID:    0,
+					C:       0,
+					Cmd:     "sleep",
+					Threads: []kernel.ThreadID{1},
+				},
+			}
+			// Create the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
+
+			// Load the container from disk and check the status.
+			c, err = Load(rootDir, args.ID)
+			if err != nil {
+				t.Fatalf("error loading container: %v", err)
+			}
+			if got, want := c.Status, Created; got != want {
+				t.Errorf("container status got %v, want %v", got, want)
+			}
+
+			// List should return the container id.
+			ids, err := List(rootDir)
+			if err != nil {
+				t.Fatalf("error listing containers: %v", err)
+			}
+			if got, want := ids, []string{args.ID}; !reflect.DeepEqual(got, want) {
+				t.Errorf("container list got %v, want %v", got, want)
+			}
+
+			// Start the container.
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			// Load the container from disk and check the status.
+			c, err = Load(rootDir, args.ID)
+			if err != nil {
+				t.Fatalf("error loading container: %v", err)
+			}
+			if got, want := c.Status, Running; got != want {
+				t.Errorf("container status got %v, want %v", got, want)
+			}
+
+			// Verify that "sleep 100" is running.
+			if err := waitForProcessList(c, expectedPL); err != nil {
+				t.Error(err)
+			}
+
+			// Wait on the container. The channel is buffered and the goroutine
+			// sends exactly one result, so it can never block forever, even
+			// when the test bails out before reading from the channel.
+			ch := make(chan error, 1)
+			go func() {
+				ws, err := c.Wait()
+				if err != nil {
+					ch <- err
+					return
+				}
+				if got, want := ws.Signal(), syscall.SIGTERM; got != want {
+					ch <- fmt.Errorf("got signal %v, want %v", got, want)
+					return
+				}
+				ch <- nil
+			}()
+
+			// Wait a bit to ensure that we've started waiting on
+			// the container before we signal.
+			time.Sleep(time.Second)
+
+			// Send the container a SIGTERM which will cause it to stop.
+			if err := c.SignalContainer(syscall.SIGTERM, false); err != nil {
+				t.Fatalf("error sending signal %v to container: %v", syscall.SIGTERM, err)
+			}
+
+			// Wait for it to die.
+			if err := <-ch; err != nil {
+				t.Fatalf("error waiting for container: %v", err)
+			}
+
+			// Load the container from disk and check the status.
+			c, err = Load(rootDir, args.ID)
+			if err != nil {
+				t.Fatalf("error loading container: %v", err)
+			}
+			if got, want := c.Status, Stopped; got != want {
+				t.Errorf("container status got %v, want %v", got, want)
+			}
+
+			// Destroy the container.
+			if err := c.Destroy(); err != nil {
+				t.Fatalf("error destroying container: %v", err)
+			}
+
+			// List should not return the container id.
+			ids, err = List(rootDir)
+			if err != nil {
+				t.Fatalf("error listing containers: %v", err)
+			}
+			if len(ids) != 0 {
+				t.Errorf("expected container list to be empty, but got %v", ids)
+			}
+
+			// Loading the container by id should fail.
+			if _, err = Load(rootDir, args.ID); err == nil {
+				t.Errorf("expected loading destroyed container to fail, but it did not")
+			}
+		})
+	}
+}
+
+// TestExePath tests that we can execute the application with different path
+// formats: bare names resolved through PATH, relative paths and absolute
+// paths, including names that are masked by non-executable entries earlier
+// in PATH.
+func TestExePath(t *testing.T) {
+	// Create two directories that will be prepended to PATH.
+	firstPath, err := ioutil.TempDir(testutil.TmpDir(), "first")
+	if err != nil {
+		t.Fatalf("error creating temporary directory: %v", err)
+	}
+	defer os.RemoveAll(firstPath)
+	secondPath, err := ioutil.TempDir(testutil.TmpDir(), "second")
+	if err != nil {
+		t.Fatalf("error creating temporary directory: %v", err)
+	}
+	defer os.RemoveAll(secondPath)
+
+	// Create three minimal executables in the second path, two of which
+	// will be masked by files in first path.
+	for _, p := range []string{"unmasked", "masked1", "masked2"} {
+		path := filepath.Join(secondPath, p)
+		f, err := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0777)
+		if err != nil {
+			t.Fatalf("error opening path: %v", err)
+		}
+		// Note: deferred inside the loop, so these files stay open until the
+		// test function returns.
+		defer f.Close()
+		if _, err := io.WriteString(f, "#!/bin/true\n"); err != nil {
+			t.Fatalf("error writing contents: %v", err)
+		}
+	}
+
+	// Create a non-executable file in the first path which masks a healthy
+	// executable in the second.
+	nonExecutable := filepath.Join(firstPath, "masked1")
+	f2, err := os.OpenFile(nonExecutable, os.O_CREATE|os.O_EXCL, 0666)
+	if err != nil {
+		t.Fatalf("error opening file: %v", err)
+	}
+	f2.Close()
+
+	// Create a non-regular file in the first path which masks a healthy
+	// executable in the second.
+	nonRegular := filepath.Join(firstPath, "masked2")
+	if err := os.Mkdir(nonRegular, 0777); err != nil {
+		t.Fatalf("error making directory: %v", err)
+	}
+
+	for name, conf := range configsWithVFS2(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			for _, test := range []struct {
+				path    string
+				success bool
+			}{
+				{path: "true", success: true},
+				{path: "bin/true", success: true},
+				{path: "/bin/true", success: true},
+				{path: "thisfiledoesntexit", success: false},
+				{path: "bin/thisfiledoesntexit", success: false},
+				{path: "/bin/thisfiledoesntexit", success: false},
+
+				{path: "unmasked", success: true},
+				{path: filepath.Join(firstPath, "unmasked"), success: false},
+				{path: filepath.Join(secondPath, "unmasked"), success: true},
+
+				{path: "masked1", success: true},
+				{path: filepath.Join(firstPath, "masked1"), success: false},
+				{path: filepath.Join(secondPath, "masked1"), success: true},
+
+				{path: "masked2", success: true},
+				{path: filepath.Join(firstPath, "masked2"), success: false},
+				{path: filepath.Join(secondPath, "masked2"), success: true},
+			} {
+				t.Run(fmt.Sprintf("path=%s,success=%t", test.path, test.success), func(t *testing.T) {
+					spec := testutil.NewSpecWithArgs(test.path)
+					// PATH order matters: firstPath is searched before
+					// secondPath, which is searched before the host PATH.
+					spec.Process.Env = []string{
+						fmt.Sprintf("PATH=%s:%s:%s", firstPath, secondPath, os.Getenv("PATH")),
+					}
+
+					_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+					if err != nil {
+						t.Fatalf("exec: error setting up container: %v", err)
+					}
+					defer cleanup()
+
+					args := Args{
+						ID:        testutil.RandomContainerID(),
+						Spec:      spec,
+						BundleDir: bundleDir,
+						Attached:  true,
+					}
+					ws, err := Run(conf, args)
+
+					if test.success {
+						if err != nil {
+							t.Errorf("exec: error running container: %v", err)
+						}
+						if ws.ExitStatus() != 0 {
+							t.Errorf("exec: got exit status %v want %v", ws.ExitStatus(), 0)
+						}
+					} else {
+						if err == nil {
+							t.Errorf("exec: got: no error, want: error")
+						}
+					}
+				})
+			}
+		})
+	}
+}
+
+// Test the we can retrieve the application exit status from the container.
+func TestAppExitStatus(t *testing.T) {
+	doAppExitStatus(t, false)
+}
+
+// This is TestAppExitStatus for VFSv2.
+func TestAppExitStatusVFS2(t *testing.T) {
+	doAppExitStatus(t, true)
+}
+
+// doAppExitStatus runs two attached containers with the given VFS2 setting:
+// one that exits successfully and one that exits with a non-zero status, and
+// checks that the reported exit status matches in both cases.
+func doAppExitStatus(t *testing.T, vfs2 bool) {
+	conf := testutil.TestConfig(t)
+	conf.VFS2 = vfs2
+
+	// attachedArgs builds the Args for running spec attached from bundle.
+	attachedArgs := func(spec *specs.Spec, bundle string) Args {
+		return Args{
+			ID:        testutil.RandomContainerID(),
+			Spec:      spec,
+			BundleDir: bundle,
+			Attached:  true,
+		}
+	}
+
+	// First container will succeed.
+	okSpec := testutil.NewSpecWithArgs("true")
+	_, okBundle, okCleanup, err := testutil.SetupContainer(okSpec, conf)
+	if err != nil {
+		t.Fatalf("error setting up container: %v", err)
+	}
+	defer okCleanup()
+
+	ws, err := Run(conf, attachedArgs(okSpec, okBundle))
+	if err != nil {
+		t.Fatalf("error running container: %v", err)
+	}
+	if status := ws.ExitStatus(); status != 0 {
+		t.Errorf("got exit status %v want %v", status, 0)
+	}
+
+	// Second container exits with non-zero status.
+	wantStatus := 123
+	failSpec := testutil.NewSpecWithArgs("bash", "-c", fmt.Sprintf("exit %d", wantStatus))
+
+	_, failBundle, failCleanup, err := testutil.SetupContainer(failSpec, conf)
+	if err != nil {
+		t.Fatalf("error setting up container: %v", err)
+	}
+	defer failCleanup()
+
+	ws, err = Run(conf, attachedArgs(failSpec, failBundle))
+	if err != nil {
+		t.Fatalf("error running container: %v", err)
+	}
+	if status := ws.ExitStatus(); status != wantStatus {
+		t.Errorf("got exit status %v want %v", status, wantStatus)
+	}
+}
+
+// TestExec verifies that a container can exec a new program: it starts a
+// container running "sleep 100", execs "sleep 5" as a different user and
+// checks that both processes show up and that the exec finishes cleanly.
+func TestExec(t *testing.T) {
+	for name, conf := range configs(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			const uid = 343
+			spec := testutil.NewSpecWithArgs("sleep", "100")
+
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			// expectedPL lists the expected process state of the container.
+			expectedPL := []*control.Process{
+				{
+					UID:     0,
+					PID:     1,
+					PPID:    0,
+					C:       0,
+					Cmd:     "sleep",
+					Threads: []kernel.ThreadID{1},
+				},
+				{
+					UID:     uid,
+					PID:     2,
+					PPID:    0,
+					C:       0,
+					Cmd:     "sleep",
+					Threads: []kernel.ThreadID{2},
+				},
+			}
+
+			// Verify that "sleep 100" is running.
+			if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
+				t.Error(err)
+			}
+
+			execArgs := &control.ExecArgs{
+				Filename:         "/bin/sleep",
+				Argv:             []string{"/bin/sleep", "5"},
+				WorkingDirectory: "/",
+				KUID:             uid,
+			}
+
+			// Verify that "sleep 100" and "sleep 5" are running
+			// after exec.  First, start running exec (which
+			// blocks). The channel is buffered so the goroutine can
+			// always deliver its result and exit, even if the test has
+			// already given up waiting.
+			ch := make(chan error, 1)
+			go func() {
+				exitStatus, err := cont.executeSync(execArgs)
+				if err != nil {
+					ch <- err
+				} else if exitStatus != 0 {
+					ch <- fmt.Errorf("failed with exit status: %v", exitStatus)
+				} else {
+					ch <- nil
+				}
+			}()
+
+			if err := waitForProcessList(cont, expectedPL); err != nil {
+				t.Fatalf("error waiting for processes: %v", err)
+			}
+
+			// Ensure that exec finished without error.
+			select {
+			case <-time.After(10 * time.Second):
+				t.Fatalf("container timed out waiting for exec to finish.")
+			case err := <-ch:
+				if err != nil {
+					// Report the arguments of the exec that failed, not the
+					// container creation arguments.
+					t.Errorf("container failed to exec %v: %v", execArgs, err)
+				}
+			}
+		})
+	}
+}
+
+// TestKillPid starts a tree of nProcs processes, sends SIGKILL to the one with
+// the largest PID, and verifies that this process is gone afterwards.
+func TestKillPid(t *testing.T) {
+	for name, conf := range configs(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			appPath, err := testutil.FindFile("test/cmd/test_app/test_app")
+			if err != nil {
+				t.Fatal("error finding test_app:", err)
+			}
+
+			const nProcs = 4
+			spec := testutil.NewSpecWithArgs(appPath, "task-tree", "--depth", strconv.Itoa(nProcs-1), "--width=1", "--pause=true")
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			// Bring the container up.
+			cont, err := New(conf, Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			})
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			// The whole task tree must be up before anything is killed.
+			if err := waitForProcessCount(cont, nProcs); err != nil {
+				t.Fatalf("timed out waiting for processes to start: %v", err)
+			}
+
+			// Pick the process with the largest PID as the victim.
+			before, err := cont.Processes()
+			if err != nil {
+				t.Fatalf("failed to get process list: %v", err)
+			}
+			var victim int32
+			for _, proc := range before {
+				if candidate := int32(proc.PID); candidate > victim {
+					victim = candidate
+				}
+			}
+			if err := cont.SignalProcess(syscall.SIGKILL, victim); err != nil {
+				t.Fatalf("failed to signal process %d: %v", victim, err)
+			}
+
+			// The process count should drop by one.
+			if err := waitForProcessCount(cont, nProcs-1); err != nil {
+				t.Fatalf("error waiting for processes: %v", err)
+			}
+
+			after, err := cont.Processes()
+			if err != nil {
+				t.Fatalf("failed to get process list: %v", err)
+			}
+			for _, proc := range after {
+				if int32(proc.PID) == victim {
+					t.Fatalf("pid %d is still alive, which should be killed", victim)
+				}
+			}
+		})
+	}
+}
+
+// TestCheckpointRestore creates a container that continuously writes successive integers
+// to a file. To test checkpoint and restore functionality, the container is
+// checkpointed and the last number printed to the file is recorded. Then, it is restored in two
+// new containers and the first number printed from these containers is checked. Both should
+// be the next consecutive number after the last number from the checkpointed container.
+func TestCheckpointRestore(t *testing.T) {
+	// Skip overlay because test requires writing to host file.
+	for name, conf := range configs(t, noOverlay...) {
+		t.Run(name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "checkpoint-test")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir failed: %v", err)
+			}
+			defer os.RemoveAll(dir)
+			if err := os.Chmod(dir, 0777); err != nil {
+				t.Fatalf("error chmoding file: %q, %v", dir, err)
+			}
+
+			outputPath := filepath.Join(dir, "output")
+			outputFile, err := createWriteableOutputFile(outputPath)
+			if err != nil {
+				t.Fatalf("error creating output file: %v", err)
+			}
+			defer outputFile.Close()
+
+			script := fmt.Sprintf("for ((i=0; ;i++)); do echo $i >> %q; sleep 1; done", outputPath)
+			spec := testutil.NewSpecWithArgs("bash", "-c", script)
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			// Set the image path, which is where the checkpoint image will be saved.
+			imagePath := filepath.Join(dir, "test-image-file")
+
+			// Create the image file and open for writing.
+			file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644)
+			if err != nil {
+				t.Fatalf("error opening new file at imagePath: %v", err)
+			}
+			defer file.Close()
+
+			// Wait until the application has run.
+			if err := waitForFileNotEmpty(outputFile); err != nil {
+				t.Fatalf("Failed to wait for output file: %v", err)
+			}
+
+			// Checkpoint running container; save state into new file.
+			if err := cont.Checkpoint(file); err != nil {
+				t.Fatalf("error checkpointing container to empty file: %v", err)
+			}
+			defer os.RemoveAll(imagePath)
+
+			lastNum, err := readOutputNum(outputPath, -1)
+			if err != nil {
+				t.Fatalf("error with outputFile: %v", err)
+			}
+
+			// Delete and recreate file before restoring.
+			if err := os.Remove(outputPath); err != nil {
+				t.Fatalf("error removing file %q: %v", outputPath, err)
+			}
+			outputFile2, err := createWriteableOutputFile(outputPath)
+			if err != nil {
+				t.Fatalf("error creating output file: %v", err)
+			}
+			defer outputFile2.Close()
+
+			// Restore into a new container.
+			args2 := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont2, err := New(conf, args2)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont2.Destroy()
+
+			if err := cont2.Restore(spec, conf, imagePath); err != nil {
+				t.Fatalf("error restoring container: %v", err)
+			}
+
+			// Wait until the application has run.
+			if err := waitForFileNotEmpty(outputFile2); err != nil {
+				t.Fatalf("Failed to wait for output file: %v", err)
+			}
+
+			firstNum, err := readOutputNum(outputPath, 0)
+			if err != nil {
+				t.Fatalf("error with outputFile: %v", err)
+			}
+
+			// Check that lastNum is one less than firstNum and that the container picks
+			// up from where it left off.
+			if lastNum+1 != firstNum {
+				t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum)
+			}
+			cont2.Destroy()
+
+			// Restore into another container!
+			// Delete and recreate file before restoring.
+			if err := os.Remove(outputPath); err != nil {
+				t.Fatalf("error removing file %q: %v", outputPath, err)
+			}
+			outputFile3, err := createWriteableOutputFile(outputPath)
+			if err != nil {
+				t.Fatalf("error creating output file: %v", err)
+			}
+			defer outputFile3.Close()
+
+			// Restore into a new container.
+			args3 := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont3, err := New(conf, args3)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont3.Destroy()
+
+			if err := cont3.Restore(spec, conf, imagePath); err != nil {
+				t.Fatalf("error restoring container: %v", err)
+			}
+
+			// Wait until the application has run.
+			if err := waitForFileNotEmpty(outputFile3); err != nil {
+				t.Fatalf("Failed to wait for output file: %v", err)
+			}
+
+			firstNum2, err := readOutputNum(outputPath, 0)
+			if err != nil {
+				t.Fatalf("error with outputFile: %v", err)
+			}
+
+			// Check that lastNum is one less than firstNum2 and that the container picks
+			// up from where it left off.
+			if lastNum+1 != firstNum2 {
+				t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum2)
+			}
+			cont3.Destroy()
+		})
+	}
+}
+
+// TestUnixDomainSockets checks that Checkpoint/Restore works in cases
+// with filesystem Unix Domain Socket use.
+func TestUnixDomainSockets(t *testing.T) {
+	// Skip overlay because test requires writing to host file.
+	for name, conf := range configs(t, noOverlay...) {
+		t.Run(name, func(t *testing.T) {
+			// UDS path is limited to 108 chars for compatibility with older systems.
+			// Use '/tmp' (instead of testutil.TmpDir) to ensure the size limit is
+			// not exceeded. Assumes '/tmp' exists in the system.
+			dir, err := ioutil.TempDir("/tmp", "uds-test")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir failed: %v", err)
+			}
+			defer os.RemoveAll(dir)
+
+			outputPath := filepath.Join(dir, "uds_output")
+			outputFile, err := os.OpenFile(outputPath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666)
+			if err != nil {
+				t.Fatalf("error creating output file: %v", err)
+			}
+			defer outputFile.Close()
+
+			app, err := testutil.FindFile("test/cmd/test_app/test_app")
+			if err != nil {
+				t.Fatal("error finding test_app:", err)
+			}
+
+			socketPath := filepath.Join(dir, "uds_socket")
+			defer os.Remove(socketPath)
+
+			spec := testutil.NewSpecWithArgs(app, "uds", "--file", outputPath, "--socket", socketPath)
+			spec.Process.User = specs.User{
+				UID: uint32(os.Getuid()),
+				GID: uint32(os.Getgid()),
+			}
+			spec.Mounts = []specs.Mount{{
+				Type:        "bind",
+				Destination: dir,
+				Source:      dir,
+			}}
+
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			// Set the image path, the location where the checkpoint image will be saved.
+			imagePath := filepath.Join(dir, "test-image-file")
+
+			// Create the image file and open for writing.
+			file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644)
+			if err != nil {
+				t.Fatalf("error opening new file at imagePath: %v", err)
+			}
+			defer file.Close()
+			defer os.RemoveAll(imagePath)
+
+			// Wait until the application has run.
+			if err := waitForFileNotEmpty(outputFile); err != nil {
+				t.Fatalf("Failed to wait for output file: %v", err)
+			}
+
+			// Checkpoint running container; save state into new file.
+			if err := cont.Checkpoint(file); err != nil {
+				t.Fatalf("error checkpointing container to empty file: %v", err)
+			}
+
+			// Read the last number written before the checkpoint.
+			lastNum, err := readOutputNum(outputPath, -1)
+			if err != nil {
+				t.Fatalf("error with outputFile: %v", err)
+			}
+
+			// Delete and recreate file before restoring.
+			if err := os.Remove(outputPath); err != nil {
+				t.Fatalf("error removing file %q: %v", outputPath, err)
+			}
+			outputFile2, err := os.OpenFile(outputPath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666)
+			if err != nil {
+				t.Fatalf("error creating output file: %v", err)
+			}
+			defer outputFile2.Close()
+
+			// Restore into a new container.
+			argsRestore := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			contRestore, err := New(conf, argsRestore)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer contRestore.Destroy()
+
+			if err := contRestore.Restore(spec, conf, imagePath); err != nil {
+				t.Fatalf("error restoring container: %v", err)
+			}
+
+			// Wait until the application has run.
+			if err := waitForFileNotEmpty(outputFile2); err != nil {
+				t.Fatalf("Failed to wait for output file: %v", err)
+			}
+
+			// Read the first number written after the restore.
+			firstNum, err := readOutputNum(outputPath, 0)
+			if err != nil {
+				t.Fatalf("error with outputFile: %v", err)
+			}
+
+			// Check that lastNum is one less than firstNum.
+			if lastNum+1 != firstNum {
+				t.Errorf("error numbers not consecutive, previous: %d, next: %d", lastNum, firstNum)
+			}
+			contRestore.Destroy()
+		})
+	}
+}
+
+// TestPauseResume tests that we can successfully pause and resume a container.
+// The container will keep touching a file to indicate it's running. The test
+// pauses the container, removes the file, and checks that it doesn't get
+// recreated. Then it resumes the container and verifies that the file gets
+// created again.
+func TestPauseResume(t *testing.T) {
+	for name, conf := range configs(t, noOverlay...) {
+		t.Run(name, func(t *testing.T) {
+			tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "lock")
+			if err != nil {
+				t.Fatalf("error creating temp dir: %v", err)
+			}
+			defer os.RemoveAll(tmpDir)
+
+			running := path.Join(tmpDir, "running")
+			script := fmt.Sprintf("while [[ true ]]; do touch %q; sleep 0.1; done", running)
+			spec := testutil.NewSpecWithArgs("/bin/bash", "-c", script)
+
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			// Wait until container starts running, observed by the existence of running
+			// file.
+			if err := waitForFileExist(running); err != nil {
+				t.Errorf("error waiting for container to start: %v", err)
+			}
+
+			// Pause the running container.
+			if err := cont.Pause(); err != nil {
+				t.Errorf("error pausing container: %v", err)
+			}
+			if got, want := cont.Status, Paused; got != want {
+				t.Errorf("container status got %v, want %v", got, want)
+			}
+
+			if err := os.Remove(running); err != nil {
+				t.Fatalf("os.Remove(%q) failed: %v", running, err)
+			}
+			// Script touches the file every 100ms. Give it a bit of time to run to
+			// catch the case that pause didn't work.
+			time.Sleep(200 * time.Millisecond)
+			if _, err := os.Stat(running); !os.IsNotExist(err) {
+				t.Fatalf("container did not pause: file exist check: %v", err)
+			}
+
+			// Resume the paused container.
+			if err := cont.Resume(); err != nil {
+				t.Errorf("error resuming container: %v", err)
+			}
+			if got, want := cont.Status, Running; got != want {
+				t.Errorf("container status got %v, want %v", got, want)
+			}
+
+			// Verify that the file is once again created by container.
+			if err := waitForFileExist(running); err != nil {
+				t.Fatalf("error resuming container: file exist check: %v", err)
+			}
+		})
+	}
+}
+
+// TestPauseResumeStatus makes sure that the statuses are set correctly
+// with calls to pause and resume and that pausing and resuming only
+// occurs given the correct state.
+func TestPauseResumeStatus(t *testing.T) {
+	spec := testutil.NewSpecWithArgs("sleep", "20")
+	conf := testutil.TestConfig(t)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+	if err != nil {
+		t.Fatalf("error setting up container: %v", err)
+	}
+	defer cleanup()
+
+	// Create and start the container.
+	args := Args{
+		ID:        testutil.RandomContainerID(),
+		Spec:      spec,
+		BundleDir: bundleDir,
+	}
+	cont, err := New(conf, args)
+	if err != nil {
+		t.Fatalf("error creating container: %v", err)
+	}
+	defer cont.Destroy()
+	if err := cont.Start(conf); err != nil {
+		t.Fatalf("error starting container: %v", err)
+	}
+
+	// Pause the running container.
+	if err := cont.Pause(); err != nil {
+		t.Errorf("error pausing container: %v", err)
+	}
+	if got, want := cont.Status, Paused; got != want {
+		t.Errorf("container status got %v, want %v", got, want)
+	}
+
+	// Try to pause again. This must fail, so a nil error is the test failure.
+	if err := cont.Pause(); err == nil {
+		t.Errorf("pausing a container that was already paused succeeded, want error")
+	}
+	if got, want := cont.Status, Paused; got != want {
+		t.Errorf("container status got %v, want %v", got, want)
+	}
+
+	// Resume the paused container.
+	if err := cont.Resume(); err != nil {
+		t.Errorf("error resuming container: %v", err)
+	}
+	if got, want := cont.Status, Running; got != want {
+		t.Errorf("container status got %v, want %v", got, want)
+	}
+
+	// Try to resume again. This must fail, so a nil error is the test failure.
+	if err := cont.Resume(); err == nil {
+		t.Errorf("resuming a container that was already running succeeded, want error")
+	}
+	if got, want := cont.Status, Running; got != want {
+		t.Errorf("container status got %v, want %v", got, want)
+	}
+}
+
+// TestCapabilities verifies that:
+// - Running exec as non-root UID and GID will result in an error (because the
+//   executable file can't be read).
+// - Running exec as non-root with CAP_DAC_OVERRIDE succeeds because it skips
+//   this check.
+func TestCapabilities(t *testing.T) {
+	// Pick uid/gid different than ours.
+	uid := auth.KUID(os.Getuid() + 1)
+	gid := auth.KGID(os.Getgid() + 1)
+
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			spec := testutil.NewSpecWithArgs("sleep", "100")
+			rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			// expectedPL lists the expected process state of the container.
+			expectedPL := []*control.Process{
+				{
+					UID:     0,
+					PID:     1,
+					PPID:    0,
+					C:       0,
+					Cmd:     "sleep",
+					Threads: []kernel.ThreadID{1},
+				},
+				{
+					UID:     uid,
+					PID:     2,
+					PPID:    0,
+					C:       0,
+					Cmd:     "exe",
+					Threads: []kernel.ThreadID{2},
+				},
+			}
+			if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
+				t.Fatalf("Failed to wait for sleep to start, err: %v", err)
+			}
+
+			// Create an executable that can't be run with the specified UID:GID until
+			// the CAP_DAC_OVERRIDE capability is added to skip the access check.
+			exePath := filepath.Join(rootDir, "exe")
+			if err := ioutil.WriteFile(exePath, []byte("#!/bin/sh\necho hello"), 0770); err != nil {
+				t.Fatalf("couldn't create executable: %v", err)
+			}
+			defer os.Remove(exePath)
+
+			// Need to traverse the intermediate directory.
+			if err := os.Chmod(rootDir, 0755); err != nil {
+				t.Fatalf("os.Chmod(%q) failed: %v", rootDir, err)
+			}
+
+			execArgs := &control.ExecArgs{
+				Filename:         exePath,
+				Argv:             []string{exePath},
+				WorkingDirectory: "/",
+				KUID:             uid,
+				KGID:             gid,
+				Capabilities:     &auth.TaskCapabilities{},
+			}
+
+			// "exe" should fail because we don't have the necessary permissions.
+			if _, err := cont.executeSync(execArgs); err == nil {
+				t.Fatalf("container executed without error, but an error was expected")
+			}
+
+			// Now we run with the capability enabled; "exe" should not fail this time.
+			execArgs.Capabilities = &auth.TaskCapabilities{
+				EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE),
+			}
+			if _, err := cont.executeSync(execArgs); err != nil {
+				t.Fatalf("container failed to exec %v: %v", execArgs, err)
+			}
+		})
+	}
+}
+
+// TestRunNonRoot checks that sandbox can be configured when running as
+// non-privileged user.
+func TestRunNonRoot(t *testing.T) {
+	for name, conf := range configsWithVFS2(t, noOverlay...) {
+		t.Run(name, func(t *testing.T) {
+			spec := testutil.NewSpecWithArgs("/bin/true")
+			// Set a random user/group with no access to "blocked" dir.
+			spec.Process.User.UID = 343
+			spec.Process.User.GID = 2401
+			spec.Process.Capabilities = nil
+
+			// The container user can't list '$TMP/blocked' and would fail to mount it.
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "blocked")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			defer os.RemoveAll(dir) // Argument is evaluated now, before dir is reassigned below.
+			if err := os.Chmod(dir, 0700); err != nil {
+				t.Fatalf("os.Chmod(%q) failed: %v", dir, err)
+			}
+			dir = path.Join(dir, "test")
+			if err := os.Mkdir(dir, 0755); err != nil {
+				t.Fatalf("os.MkDir(%q) failed: %v", dir, err)
+			}
+
+			src, err := ioutil.TempDir(testutil.TmpDir(), "src")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			defer os.RemoveAll(src)
+
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Destination: dir,
+				Source:      src,
+				Type:        "bind",
+			})
+
+			if err := run(spec, conf); err != nil {
+				t.Fatalf("error running sandbox: %v", err)
+			}
+		})
+	}
+}
+
+// TestMountNewDir checks that runsc will create destination directory if it
+// doesn't exist.
+func TestMountNewDir(t *testing.T) {
+	for name, conf := range configsWithVFS2(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			root, err := ioutil.TempDir(testutil.TmpDir(), "root")
+			if err != nil {
+				t.Fatal("ioutil.TempDir() failed:", err)
+			}
+			defer os.RemoveAll(root)
+
+			srcDir := path.Join(root, "src", "dir", "anotherdir")
+			if err := os.MkdirAll(srcDir, 0755); err != nil {
+				t.Fatalf("os.MkdirAll(%q) failed: %v", srcDir, err)
+			}
+
+			mountDir := path.Join(root, "dir", "anotherdir")
+			spec := testutil.NewSpecWithArgs("/bin/ls", mountDir)
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Destination: mountDir,
+				Source:      srcDir,
+				Type:        "bind",
+			})
+
+			if err := run(spec, conf); err != nil {
+				t.Fatalf("error running sandbox: %v", err)
+			}
+		})
+	}
+}
+
+// TestReadonlyRoot checks that touching /foo under a read-only root exits with EPERM.
+func TestReadonlyRoot(t *testing.T) {
+	for name, conf := range configsWithVFS2(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			spec := testutil.NewSpecWithArgs("/bin/touch", "/foo")
+			spec.Root.Readonly = true
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			// Bring the container up and wait for it to finish.
+			cont, err := New(conf, Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			})
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			status, err := cont.Wait()
+			if err != nil {
+				t.Fatalf("error waiting on container: %v", err)
+			}
+			if !(status.Exited() && syscall.Errno(status.ExitStatus()) == syscall.EPERM) {
+				t.Fatalf("container failed, waitStatus: %v", status)
+			}
+		})
+	}
+}
+
+func TestUIDMap(t *testing.T) {
+	for name, conf := range configs(t, noOverlay...) {
+		t.Run(name, func(t *testing.T) {
+			testDir, err := ioutil.TempDir(testutil.TmpDir(), "test-mount")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			defer os.RemoveAll(testDir)
+			testFile := path.Join(testDir, "testfile")
+
+			spec := testutil.NewSpecWithArgs("touch", "/tmp/testfile")
+			uid := os.Getuid()
+			gid := os.Getgid()
+			spec.Linux = &specs.Linux{
+				Namespaces: []specs.LinuxNamespace{
+					{Type: specs.UserNamespace},
+					{Type: specs.PIDNamespace},
+					{Type: specs.MountNamespace},
+				},
+				UIDMappings: []specs.LinuxIDMapping{
+					{
+						ContainerID: 0,
+						HostID:      uint32(uid),
+						Size:        1,
+					},
+				},
+				GIDMappings: []specs.LinuxIDMapping{
+					{
+						ContainerID: 0,
+						HostID:      uint32(gid),
+						Size:        1,
+					},
+				},
+			}
+
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Destination: "/tmp",
+				Source:      testDir,
+				Type:        "bind",
+			})
+
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			// Create, start and wait for the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			ws, err := c.Wait()
+			if err != nil {
+				t.Fatalf("error waiting on container: %v", err)
+			}
+			if !ws.Exited() || ws.ExitStatus() != 0 {
+				t.Fatalf("container failed, waitStatus: %v", ws)
+			}
+			st := syscall.Stat_t{}
+			if err := syscall.Stat(testFile, &st); err != nil {
+				t.Fatalf("error stat /testfile: %v", err)
+			}
+
+			if st.Uid != uint32(uid) || st.Gid != uint32(gid) {
+				t.Fatalf("UID: %d (%d) GID: %d (%d)", st.Uid, uid, st.Gid, gid)
+			}
+		})
+	}
+}
+
+// TestReadonlyMount checks that touching a file in a read-only bind mount exits with EPERM.
+func TestReadonlyMount(t *testing.T) {
+	for name, conf := range configsWithVFS2(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			defer os.RemoveAll(dir)
+			spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Destination: dir,
+				Source:      dir,
+				Type:        "bind",
+				Options:     []string{"ro"},
+			})
+			spec.Root.Readonly = false
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			// Create, start and wait for the container.
+			c, err := New(conf, Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			})
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			ws, err := c.Wait()
+			if err != nil {
+				t.Fatalf("error waiting on container: %v", err)
+			}
+			if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
+				t.Fatalf("container failed, waitStatus: %v", ws)
+			}
+		})
+	}
+}
+
+// TestBindMountByOption checks that touch succeeds in a mount of type "none" with options "rw" and "bind".
+func TestBindMountByOption(t *testing.T) {
+	for name, conf := range configs(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "bind-mount")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Destination: dir,
+				Source:      dir,
+				Type:        "none",
+				Options:     []string{"rw", "bind"},
+			})
+			if err := run(spec, conf); err != nil {
+				t.Fatalf("error running sandbox: %v", err)
+			}
+		})
+	}
+}
+
+// TestAbbreviatedIDs checks that runsc supports using abbreviated container
+// IDs in place of full IDs.
+func TestAbbreviatedIDs(t *testing.T) {
+	doAbbreviatedIDsTest(t, false) // vfs2=false
+}
+
+func TestAbbreviatedIDsVFS2(t *testing.T) {
+	doAbbreviatedIDsTest(t, true) // vfs2=true: same checks as TestAbbreviatedIDs with conf.VFS2 set.
+}
+
+// doAbbreviatedIDsTest creates three containers under one root directory and
+// checks which abbreviated IDs Load resolves; vfs2 is copied into conf.VFS2.
+func doAbbreviatedIDsTest(t *testing.T, vfs2 bool) {
+	rootDir, cleanup, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer cleanup()
+
+	conf := testutil.TestConfig(t)
+	conf.RootDir = rootDir
+	conf.VFS2 = vfs2
+
+	cids := []string{
+		"foo-" + testutil.RandomContainerID(),
+		"bar-" + testutil.RandomContainerID(),
+		"baz-" + testutil.RandomContainerID(),
+	}
+	for _, cid := range cids {
+		spec := testutil.NewSpecWithArgs("sleep", "100")
+		bundleDir, cleanup, err := testutil.SetupBundleDir(spec)
+		if err != nil {
+			t.Fatalf("error setting up container: %v", err)
+		}
+		defer cleanup()
+
+		// Create the container; defers run at function exit, so all three coexist below.
+		cont, err := New(conf, Args{
+			ID:        cid,
+			Spec:      spec,
+			BundleDir: bundleDir,
+		})
+		if err != nil {
+			t.Fatalf("error creating container: %v", err)
+		}
+		defer cont.Destroy()
+	}
+
+	// These should all be unambiguous.
+	unambiguous := map[string]string{
+		"f":     cids[0],
+		cids[0]: cids[0],
+		"bar":   cids[1],
+		cids[1]: cids[1],
+		"baz":   cids[2],
+		cids[2]: cids[2],
+	}
+	for shortid, longid := range unambiguous {
+		if c, err := Load(rootDir, shortid); err != nil {
+			t.Errorf("%q should resolve to %q: %v", shortid, longid, err)
+		} else if c.ID != longid {
+			t.Errorf("%q resolved to %q, want %q", shortid, c.ID, longid)
+		}
+	}
+
+	// These should be ambiguous.
+	ambiguous := []string{"b", "ba"}
+	for _, shortid := range ambiguous {
+		if s, err := Load(rootDir, shortid); err == nil {
+			t.Errorf("%q should be ambiguous, but resolved to %q", shortid, s.ID)
+		}
+	}
+}
+
+func TestGoferExits(t *testing.T) {
+	doGoferExitTest(t, false) // vfs2=false
+}
+
+func TestGoferExitsVFS2(t *testing.T) {
+	doGoferExitTest(t, true) // vfs2=true: same checks as TestGoferExits with conf.VFS2 set.
+}
+
+// doGoferExitTest starts a container, kills its sandbox process and expects
+// the gofer to exit on its own; vfs2 is copied into conf.VFS2.
+func doGoferExitTest(t *testing.T, vfs2 bool) {
+	spec := testutil.NewSpecWithArgs("/bin/sleep", "10000")
+	conf := testutil.TestConfig(t)
+	conf.VFS2 = vfs2
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+	if err != nil {
+		t.Fatalf("error setting up container: %v", err)
+	}
+	defer cleanup()
+
+	// Bring the container up.
+	cont, err := New(conf, Args{
+		ID:        testutil.RandomContainerID(),
+		Spec:      spec,
+		BundleDir: bundleDir,
+	})
+	if err != nil {
+		t.Fatalf("error creating container: %v", err)
+	}
+	defer cont.Destroy()
+	if err := cont.Start(conf); err != nil {
+		t.Fatalf("error starting container: %v", err)
+	}
+
+	// Take the sandbox down; the gofer is expected to exit without being told to.
+	sandbox, err := os.FindProcess(cont.Sandbox.Pid)
+	if err != nil {
+		t.Fatalf("error finding sandbox process: %v", err)
+	}
+	if err := sandbox.Kill(); err != nil {
+		t.Fatalf("error killing sandbox process: %v", err)
+	}
+
+	// ECHILD from the wait is tolerated; any other error fails the test.
+	if err := blockUntilWaitable(cont.GoferPid); err != nil && err != syscall.ECHILD {
+		t.Errorf("error waiting for gofer to exit: %v", err)
+	}
+}
+
+// TestRootNotMount runs test_app with its own directory as read-only root and no mounts.
+func TestRootNotMount(t *testing.T) {
+	link, err := testutil.FindFile("test/cmd/test_app/test_app")
+	if err != nil {
+		t.Fatal("error finding test_app:", err)
+	}
+
+	target, err := filepath.EvalSymlinks(link)
+	if err != nil {
+		t.Fatalf("error resolving %q symlink: %v", link, err)
+	}
+	log.Infof("App path %q is a symlink to %q", link, target)
+
+	isStatic, err := testutil.IsStatic(target)
+	if err != nil {
+		t.Fatalf("error reading application binary: %v", err)
+	}
+	if !isStatic {
+		// Race builds produce a non-static binary; shared libraries cannot be
+		// mapped in as well, so the test has to be skipped.
+		t.Skip()
+	}
+
+	rootPath := filepath.Dir(target)
+	exePath := "/" + filepath.Base(target)
+	log.Infof("Executing %q in %q", exePath, rootPath)
+
+	spec := testutil.NewSpecWithArgs(exePath, "help")
+	spec.Root.Path = rootPath
+	spec.Root.Readonly = true
+	spec.Mounts = nil
+
+	if err := run(spec, testutil.TestConfig(t)); err != nil {
+		t.Fatalf("error running sandbox: %v", err)
+	}
+}
+
+// TestUserLog runs test_app with a syscall that gVisor does not implement and
+// checks that the file passed as Args.UserLog contains an "Unsupported
+// syscall" entry naming that syscall.
+func TestUserLog(t *testing.T) {
+	app, err := testutil.FindFile("test/cmd/test_app/test_app")
+	if err != nil {
+		t.Fatal("error finding test_app:", err)
+	}
+
+	// sched_rr_get_interval = 148 - not implemented in gvisor.
+	spec := testutil.NewSpecWithArgs(app, "syscall", "--syscall=148")
+	conf := testutil.TestConfig(t)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+	if err != nil {
+		t.Fatalf("error setting up container: %v", err)
+	}
+	defer cleanup()
+
+	dir, err := ioutil.TempDir(testutil.TmpDir(), "user_log_test")
+	if err != nil {
+		t.Fatalf("error creating tmp dir: %v", err)
+	}
+	// Remove the scratch directory, and the log file inside it, once the
+	// test is over so repeated runs do not leave files behind.
+	defer os.RemoveAll(dir)
+	userLog := filepath.Join(dir, "user.log")
+
+	// Create, start and wait for the container.
+	args := Args{
+		ID:        testutil.RandomContainerID(),
+		Spec:      spec,
+		BundleDir: bundleDir,
+		UserLog:   userLog,
+		Attached:  true,
+	}
+	ws, err := Run(conf, args)
+	if err != nil {
+		t.Fatalf("error running container: %v", err)
+	}
+	if !ws.Exited() || ws.ExitStatus() != 0 {
+		t.Fatalf("container failed, waitStatus: %v", ws)
+	}
+
+	out, err := ioutil.ReadFile(userLog)
+	if err != nil {
+		t.Fatalf("error opening user log file %q: %v", userLog, err)
+	}
+	if want := "Unsupported syscall: sched_rr_get_interval"; !strings.Contains(string(out), want) {
+		t.Errorf("user log file doesn't contain %q, out: %s", want, string(out))
+	}
+}
+
+// TestWaitOnExitedSandbox starts a short-lived shell that exits with a known
+// non-zero code and checks that Wait reports that code both on the first
+// call and on a second call made after the first one has returned.
+func TestWaitOnExitedSandbox(t *testing.T) {
+	for name, conf := range configsWithVFS2(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			// The shell sleeps for 1 second and then exits with a non-zero
+			// code.
+			const wantExit = 17
+			script := fmt.Sprintf("sleep 1; exit %d", wantExit)
+			spec := testutil.NewSpecWithArgs("/bin/sh", "-c", script)
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			// Create and start the container.
+			cont, err := New(conf, Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			})
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if startErr := cont.Start(conf); startErr != nil {
+				t.Fatalf("error starting container: %v", startErr)
+			}
+
+			// The first Wait makes an RPC to the sandbox and gets the actual
+			// exit status of the application. By the second Wait the sandbox
+			// has exited but its zombie process still exists, and Wait
+			// returns the sandbox exit status. Both must match wantExit.
+			for attempt := 0; attempt < 2; attempt++ {
+				ws, err := cont.Wait()
+				if err != nil {
+					t.Fatalf("error waiting on container: %v", err)
+				}
+				if got := ws.ExitStatus(); got != wantExit {
+					t.Errorf("got exit status %d, want %d", got, wantExit)
+				}
+			}
+		})
+	}
+}
+
+func TestDestroyNotStarted(t *testing.T) {
+	doDestroyNotStartedTest(t, false)
+}
+
+func TestDestroyNotStartedVFS2(t *testing.T) {
+	doDestroyNotStartedTest(t, true)
+}
+
+// doDestroyNotStartedTest creates a container for "/bin/sleep 100" without
+// starting it and fails the test if Destroy returns an error. vfs2 is copied
+// into conf.VFS2.
+func doDestroyNotStartedTest(t *testing.T, vfs2 bool) {
+	sleepSpec := testutil.NewSpecWithArgs("/bin/sleep", "100")
+	conf := testutil.TestConfig(t)
+	conf.VFS2 = vfs2
+	_, bundleDir, cleanup, err := testutil.SetupContainer(sleepSpec, conf)
+	if err != nil {
+		t.Fatalf("error setting up container: %v", err)
+	}
+	defer cleanup()
+
+	// Only create the container; Start is deliberately never called.
+	cont, err := New(conf, Args{
+		ID:        testutil.RandomContainerID(),
+		Spec:      sleepSpec,
+		BundleDir: bundleDir,
+	})
+	if err != nil {
+		t.Fatalf("error creating container: %v", err)
+	}
+	if destroyErr := cont.Destroy(); destroyErr != nil {
+		t.Fatalf("deleting non-started container failed: %v", destroyErr)
+	}
+}
+
+// TestDestroyStarting attempts to force a race between start and destroy,
+// with VFS2 turned off. It delegates to doDestroyStartingTest; calling
+// doDestroyNotStartedTest here would only repeat TestDestroyNotStarted and
+// leave the race scenario unexercised.
+func TestDestroyStarting(t *testing.T) {
+	doDestroyStartingTest(t, false)
+}
+
+// TestDestroyStartedVFS2 attempts to force a race between start and destroy,
+// with VFS2 turned on. It delegates to doDestroyStartingTest; calling
+// doDestroyNotStartedTest here would only repeat TestDestroyNotStartedVFS2.
+func TestDestroyStartedVFS2(t *testing.T) {
+	doDestroyStartingTest(t, true)
+}
+
+// doDestroyStartingTest makes ten attempts at racing Container.Start against
+// Container.Destroy on the same container ID. Start is allowed to fail;
+// Destroy must always succeed. vfs2 is copied into conf.VFS2.
+func doDestroyStartingTest(t *testing.T, vfs2 bool) {
+	for i := 0; i < 10; i++ {
+		// Each attempt runs in its own function so that its deferred cleanup
+		// executes when the attempt ends rather than piling up until all
+		// attempts are done.
+		func() {
+			spec := testutil.NewSpecWithArgs("/bin/sleep", "100")
+			conf := testutil.TestConfig(t)
+			conf.VFS2 = vfs2
+			rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			// Create the container and check that it can be destroyed.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+
+			// Container is not thread safe, so load another instance to run
+			// concurrently.
+			startCont, err := Load(rootDir, args.ID)
+			if err != nil {
+				t.Fatalf("error loading container: %v", err)
+			}
+			wg := sync.WaitGroup{}
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				// Ignore failures, start can fail if destroy runs first.
+				startCont.Start(conf)
+			}()
+
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				if err := c.Destroy(); err != nil {
+					t.Errorf("deleting non-started container failed: %v", err)
+				}
+			}()
+			// Both goroutines must be done before the deferred cleanup runs.
+			wg.Wait()
+		}()
+	}
+}
+
+// TestCreateWorkingDir runs a container whose working directory does not
+// exist beforehand and touches a file inside it, so the run only succeeds if
+// the directory got created.
+func TestCreateWorkingDir(t *testing.T) {
+	for name, conf := range configs(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "cwd-create")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			// Do not leave the scratch directory behind after the subtest.
+			defer os.RemoveAll(tmpDir)
+			dir := path.Join(tmpDir, "new/working/dir")
+
+			// touch will fail if the directory doesn't exist.
+			spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
+			spec.Process.Cwd = dir
+			spec.Root.Readonly = true
+
+			if err := run(spec, conf); err != nil {
+				t.Fatalf("Error running container: %v", err)
+			}
+		})
+	}
+}
+
+// TestMountPropagation verifies that a mount propagates to slave mounts but
+// not to private mounts.
+//
+// NOTE(review): the host mounts and the temporary directory created here are
+// not undone by this test.
+func TestMountPropagation(t *testing.T) {
+	// Setup dir structure:
+	//   - src: is mounted as shared and is used as source for both private and
+	//     slave mounts
+	//   - dir: will be bind mounted inside src and should propagate to slave
+	tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "mount")
+	if err != nil {
+		t.Fatalf("ioutil.TempDir() failed: %v", err)
+	}
+	src := filepath.Join(tmpDir, "src")
+	srcMnt := filepath.Join(src, "mnt")
+	dir := filepath.Join(tmpDir, "dir")
+	for _, p := range []string{src, srcMnt, dir} {
+		if err := os.MkdirAll(p, 0777); err != nil {
+			t.Fatalf("MkdirAll(%q): %v", p, err)
+		}
+	}
+	dirFile := filepath.Join(dir, "file")
+	f, err := os.Create(dirFile)
+	if err != nil {
+		t.Fatalf("os.Create(%q): %v", dirFile, err)
+	}
+	f.Close()
+
+	// Setup src as a shared mount. The failure messages report the paths
+	// that were actually passed to mount.
+	if err := syscall.Mount(src, src, "bind", syscall.MS_BIND, ""); err != nil {
+		t.Fatalf("mount(%q, %q, MS_BIND): %v", src, src, err)
+	}
+	if err := syscall.Mount("", src, "", syscall.MS_SHARED, ""); err != nil {
+		t.Fatalf("mount(%q, MS_SHARED): %v", src, err)
+	}
+
+	spec := testutil.NewSpecWithArgs("sleep", "1000")
+
+	priv := filepath.Join(tmpDir, "priv")
+	slave := filepath.Join(tmpDir, "slave")
+	spec.Mounts = []specs.Mount{
+		{
+			Source:      src,
+			Destination: priv,
+			Type:        "bind",
+			Options:     []string{"private"},
+		},
+		{
+			Source:      src,
+			Destination: slave,
+			Type:        "bind",
+			Options:     []string{"slave"},
+		},
+	}
+
+	conf := testutil.TestConfig(t)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+	if err != nil {
+		t.Fatalf("error setting up container: %v", err)
+	}
+	defer cleanup()
+
+	args := Args{
+		ID:        testutil.RandomContainerID(),
+		Spec:      spec,
+		BundleDir: bundleDir,
+	}
+	cont, err := New(conf, args)
+	if err != nil {
+		t.Fatalf("creating container: %v", err)
+	}
+	defer cont.Destroy()
+
+	if err := cont.Start(conf); err != nil {
+		t.Fatalf("starting container: %v", err)
+	}
+
+	// After the container is started, mount dir inside source and check what
+	// happens to both destinations.
+	if err := syscall.Mount(dir, srcMnt, "bind", syscall.MS_BIND, ""); err != nil {
+		t.Fatalf("mount(%q, %q, MS_BIND): %v", dir, srcMnt, err)
+	}
+
+	// Check that mount didn't propagate to private mount.
+	privFile := filepath.Join(priv, "mnt", "file")
+	execArgs := &control.ExecArgs{
+		Filename: "/usr/bin/test",
+		Argv:     []string{"test", "!", "-f", privFile},
+	}
+	if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
+		t.Fatalf("exec: test ! -f %q, ws: %v, err: %v", privFile, ws, err)
+	}
+
+	// Check that mount propagated to slave mount.
+	slaveFile := filepath.Join(slave, "mnt", "file")
+	execArgs = &control.ExecArgs{
+		Filename: "/usr/bin/test",
+		Argv:     []string{"test", "-f", slaveFile},
+	}
+	if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
+		t.Fatalf("exec: test -f %q, ws: %v, err: %v", slaveFile, ws, err)
+	}
+}
+
+// TestMountSymlink bind-mounts a source directory onto a symlink and checks,
+// from inside the container, that the mounted file is visible under the
+// directory the symlink points to.
+func TestMountSymlink(t *testing.T) {
+	for name, conf := range configs(t, overlay) {
+		t.Run(name, func(t *testing.T) {
+			base, err := ioutil.TempDir(testutil.TmpDir(), "mount-symlink")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			defer os.RemoveAll(base)
+
+			source := path.Join(base, "source")
+			target := path.Join(base, "target")
+			for _, d := range []string{source, target} {
+				if mkErr := os.MkdirAll(d, 0777); mkErr != nil {
+					t.Fatalf("os.MkdirAll(): %v", mkErr)
+				}
+			}
+			marker, err := os.Create(path.Join(source, "file"))
+			if err != nil {
+				t.Fatalf("os.Create(): %v", err)
+			}
+			marker.Close()
+
+			link := path.Join(base, "link")
+			if lnErr := os.Symlink(target, link); lnErr != nil {
+				t.Fatalf("os.Symlink(%q, %q): %v", target, link, lnErr)
+			}
+
+			spec := testutil.NewSpecWithArgs("/bin/sleep", "1000")
+
+			// The mount destination is the symlink, so the mount code has to
+			// follow it and mount at the symlink target.
+			symlinkMount := specs.Mount{
+				Type:        "bind",
+				Destination: link,
+				Source:      source,
+			}
+			spec.Mounts = append(spec.Mounts, symlinkMount)
+
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			cont, err := New(conf, Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			})
+			if err != nil {
+				t.Fatalf("creating container: %v", err)
+			}
+			defer cont.Destroy()
+
+			if startErr := cont.Start(conf); startErr != nil {
+				t.Fatalf("starting container: %v", startErr)
+			}
+
+			// The file must show up below the directory the symlink points
+			// to, which proves the symlink was resolved before mounting.
+			file := path.Join(target, "file")
+			ws, err := cont.executeSync(&control.ExecArgs{
+				Filename: "/usr/bin/test",
+				Argv:     []string{"test", "-f", file},
+			})
+			if err != nil || ws != 0 {
+				t.Fatalf("exec: test -f %q, ws: %v, err: %v", file, ws, err)
+			}
+		})
+	}
+}
+
+// TestNetRaw runs test_app's "capability" command once with conf.EnableRaw
+// set and once with it cleared, passing "--enabled" or "--disabled"
+// respectively together with the CAP_NET_RAW capability mask.
+func TestNetRaw(t *testing.T) {
+	capNetRaw := strconv.FormatUint(bits.MaskOf64(int(linux.CAP_NET_RAW)), 10)
+	app, err := testutil.FindFile("test/cmd/test_app/test_app")
+	if err != nil {
+		t.Fatal("error finding test_app:", err)
+	}
+
+	// Flag handed to test_app for each EnableRaw setting.
+	expectFlag := map[bool]string{
+		true:  "--enabled",
+		false: "--disabled",
+	}
+	for _, enableRaw := range []bool{true, false} {
+		conf := testutil.TestConfig(t)
+		conf.EnableRaw = enableRaw
+
+		spec := testutil.NewSpecWithArgs(app, "capability", expectFlag[enableRaw], capNetRaw)
+		if runErr := run(spec, conf); runErr != nil {
+			t.Fatalf("Error running container: %v", runErr)
+		}
+	}
+}
+
+// TestOverlayfsStaleRead is the most basic test that
+// '--overlayfs-stale-read' works. A container reads an input file, then
+// overwrites it and copies it to an output file; the output file must hold
+// the new content rather than the data read before the overwrite.
+func TestOverlayfsStaleRead(t *testing.T) {
+	conf := testutil.TestConfig(t)
+	conf.OverlayfsStaleRead = true
+
+	in, err := ioutil.TempFile(testutil.TmpDir(), "stale-read.in")
+	if err != nil {
+		t.Fatalf("ioutil.TempFile() failed: %v", err)
+	}
+	// Deferred calls run last-in first-out: the file is closed, then removed,
+	// so the test leaves no temporary files behind.
+	defer os.Remove(in.Name())
+	defer in.Close()
+	if _, err := in.WriteString("stale data"); err != nil {
+		t.Fatalf("in.WriteString() failed: %v", err)
+	}
+
+	out, err := ioutil.TempFile(testutil.TmpDir(), "stale-read.out")
+	if err != nil {
+		t.Fatalf("ioutil.TempFile() failed: %v", err)
+	}
+	defer os.Remove(out.Name())
+	defer out.Close()
+
+	const want = "foobar"
+	cmd := fmt.Sprintf("cat %q >&2 && echo %q> %q && cp %q %q", in.Name(), want, in.Name(), in.Name(), out.Name())
+	spec := testutil.NewSpecWithArgs("/bin/bash", "-c", cmd)
+	if err := run(spec, conf); err != nil {
+		t.Fatalf("Error running container: %v", err)
+	}
+
+	gotBytes, err := ioutil.ReadAll(out)
+	if err != nil {
+		t.Fatalf("ioutil.ReadAll(out) failed: %v", err)
+	}
+	got := strings.TrimSpace(string(gotBytes))
+	if want != got {
+		t.Errorf("Wrong content in out file, got: %q. want: %q", got, want)
+	}
+}
+
+// TestTTYField checks the TTY field returned by container.Processes() for a
+// sleep process started with and without a TTY, each with VFS2 off and on.
+func TestTTYField(t *testing.T) {
+	stop := testutil.StartReaper()
+	defer stop()
+
+	testApp, err := testutil.FindFile("test/cmd/test_app/test_app")
+	if err != nil {
+		t.Fatal("error finding test_app:", err)
+	}
+
+	type ttyCase struct {
+		name         string
+		useTTY       bool
+		wantTTYField string
+	}
+	cases := []ttyCase{
+		{name: "no tty", useTTY: false, wantTTYField: "?"},
+		{name: "tty used", useTTY: true, wantTTYField: "pts/0"},
+	}
+
+	for _, tc := range cases {
+		for _, vfs2 := range []bool{false, true} {
+			subtest := tc.name
+			if vfs2 {
+				subtest += "-vfs2"
+			}
+			t.Run(subtest, func(t *testing.T) {
+				conf := testutil.TestConfig(t)
+				conf.VFS2 = vfs2
+
+				// The workload is /bin/sleep; when a TTY is wanted it is
+				// wrapped by test_app's "pty-runner".
+				argv := []string{"/bin/sleep", "10000"}
+				if tc.useTTY {
+					argv = append([]string{testApp, "pty-runner"}, argv...)
+				}
+
+				spec := testutil.NewSpecWithArgs(argv...)
+				_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+				if err != nil {
+					t.Fatalf("error setting up container: %v", err)
+				}
+				defer cleanup()
+
+				// Create and start the container.
+				cont, err := New(conf, Args{
+					ID:        testutil.RandomContainerID(),
+					Spec:      spec,
+					BundleDir: bundleDir,
+				})
+				if err != nil {
+					t.Fatalf("error creating container: %v", err)
+				}
+				defer cont.Destroy()
+				if startErr := cont.Start(conf); startErr != nil {
+					t.Fatalf("error starting container: %v", startErr)
+				}
+
+				// Poll the process list until sleep shows up and remember
+				// its TTY field. A failure to list processes is permanent
+				// and ends the polling.
+				var gotTTY string
+				findSleep := func() error {
+					procs, err := cont.Processes()
+					if err != nil {
+						wrapped := fmt.Errorf("error getting process data from container: %v", err)
+						return &backoff.PermanentError{Err: wrapped}
+					}
+					for _, proc := range procs {
+						if !strings.Contains(proc.Cmd, "sleep") {
+							continue
+						}
+						gotTTY = proc.TTY
+						return nil
+					}
+					return fmt.Errorf("sleep not running")
+				}
+				if pollErr := testutil.Poll(findSleep, 30*time.Second); pollErr != nil {
+					t.Fatalf("error waiting for sleep process: %v", pollErr)
+				}
+
+				if gotTTY != tc.wantTTYField {
+					t.Errorf("tty field got %q, want %q", gotTTY, tc.wantTTYField)
+				}
+			})
+		}
+	}
+}
+
+// executeSync starts the process described by args with cont.Execute and
+// then blocks in cont.WaitPID until it is done. It returns the wait status
+// of the process, or a zero status and a wrapped error if either step fails.
+func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) {
+	pid, execErr := cont.Execute(args)
+	if execErr != nil {
+		return 0, fmt.Errorf("error executing: %v", execErr)
+	}
+	status, waitErr := cont.WaitPID(pid)
+	if waitErr == nil {
+		return status, nil
+	}
+	return 0, fmt.Errorf("error waiting: %v", waitErr)
+}
+
+// TestMain is the entry point of the test binary. It enables debug logging,
+// parses flags, configures the executable path used by the tests (panicking
+// if that fails), calls specutils.MaybeRunAsRoot and finally runs the tests,
+// exiting with their result code.
+func TestMain(m *testing.M) {
+	log.SetLevel(log.Debug)
+	flag.Parse()
+	err := testutil.ConfigureExePath()
+	if err != nil {
+		panic(err.Error())
+	}
+	specutils.MaybeRunAsRoot()
+	code := m.Run()
+	os.Exit(code)
+}
diff --git a/runsc/container/hook.go b/runsc/container/hook.go
new file mode 100644
index 000000000..901607aee
--- /dev/null
+++ b/runsc/container/hook.go
@@ -0,0 +1,111 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package container
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"time"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/log"
+)
+
+// This file implements hooks as defined in OCI spec:
+// https://github.com/opencontainers/runtime-spec/blob/master/config.md#toc22
+//
+// "hooks":{
+// 		"prestart":[{
+// 			"path":"/usr/bin/dockerd",
+// 			"args":[
+// 				"libnetwork-setkey", "arg2",
+// 			]
+// 		}]
+// },
+
+// executeHooksBestEffort runs every hook in order. A hook that fails is
+// reported with a warning and does not prevent the remaining hooks from
+// running.
+func executeHooksBestEffort(hooks []specs.Hook, s specs.State) {
+	for i := range hooks {
+		hook := hooks[i]
+		err := executeHook(hook, s)
+		if err == nil {
+			continue
+		}
+		log.Warningf("Failure to execute hook %+v, err: %v", hook, err)
+	}
+}
+
+// executeHooks executes hooks until the first one fails or they all execute.
+func executeHooks(hooks []specs.Hook, s specs.State) error {
+	for _, h := range hooks {
+		if err := executeHook(h, s); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// executeHook runs a single hook and waits for it to finish.
+//
+// The hook path must be non-empty and absolute. The state s is marshalled to
+// JSON and supplied to the hook on stdin; the hook's stdout and stderr are
+// captured and included in any error returned. If h.Timeout is set and the
+// hook is still running after that many seconds, it is killed and a timeout
+// error is returned. A nil return means the hook exited successfully.
+func executeHook(h specs.Hook, s specs.State) error {
+	log.Debugf("Executing hook %+v, state: %+v", h, s)
+
+	if strings.TrimSpace(h.Path) == "" {
+		return fmt.Errorf("empty path for hook")
+	}
+	if !filepath.IsAbs(h.Path) {
+		return fmt.Errorf("path for hook is not absolute: %q", h.Path)
+	}
+
+	b, err := json.Marshal(s)
+	if err != nil {
+		return err
+	}
+	var stdout, stderr bytes.Buffer
+	cmd := exec.Cmd{
+		Path:   h.Path,
+		Args:   h.Args,
+		Env:    h.Env,
+		Stdin:  bytes.NewReader(b),
+		Stdout: &stdout,
+		Stderr: &stderr,
+	}
+	if err := cmd.Start(); err != nil {
+		return err
+	}
+
+	// The channel is buffered so the waiting goroutine can always deliver its
+	// result and exit.
+	c := make(chan error, 1)
+	go func() {
+		c <- cmd.Wait()
+	}()
+
+	// A nil timer channel never fires, so without a timeout the select below
+	// simply waits for the hook to finish.
+	var timer <-chan time.Time
+	if h.Timeout != nil {
+		timer = time.After(time.Duration(*h.Timeout) * time.Second)
+	}
+	select {
+	case err := <-c:
+		if err != nil {
+			return fmt.Errorf("failure executing hook %q, err: %v\nstdout: %s\nstderr: %s", h.Path, err, stdout.String(), stderr.String())
+		}
+	case <-timer:
+		cmd.Process.Kill()
+		// Collect the result of the Wait that is already in flight instead of
+		// calling cmd.Wait a second time: a second call fails immediately and
+		// does not guarantee that output copying has finished, so the buffers
+		// could still be written to while they are read below.
+		<-c
+		return fmt.Errorf("timeout executing hook %q\nstdout: %s\nstderr: %s", h.Path, stdout.String(), stderr.String())
+	}
+
+	log.Debugf("Execute hook %q success!", h.Path)
+	return nil
+}
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
new file mode 100644
index 000000000..f6861b1dd
--- /dev/null
+++ b/runsc/container/multi_container_test.go
@@ -0,0 +1,1711 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package container
+
+import (
+	"fmt"
+	"io/ioutil"
+	"math"
+	"os"
+	"path"
+	"path/filepath"
+	"strings"
+	"syscall"
+	"testing"
+	"time"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/test/testutil"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+// createSpecs builds one spec and one container ID per command. The first
+// spec is annotated as the sandbox (root) container; every later spec is
+// annotated as a regular container that refers to the root container's ID.
+// The returned slices are parallel: ids[i] belongs to specs[i].
+func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
+	var (
+		podSpecs []*specs.Spec
+		podIDs   []string
+	)
+	sandboxID := testutil.RandomContainerID()
+
+	for idx, argv := range cmds {
+		s := testutil.NewSpecWithArgs(argv...)
+		id := sandboxID
+		if idx != 0 {
+			s.Annotations = map[string]string{
+				specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer,
+				specutils.ContainerdSandboxIDAnnotation:     sandboxID,
+			}
+			id = testutil.RandomContainerID()
+		} else {
+			s.Annotations = map[string]string{
+				specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeSandbox,
+			}
+		}
+		podIDs = append(podIDs, id)
+		podSpecs = append(podSpecs, s)
+	}
+	return podSpecs, podIDs
+}
+
+// startContainers creates and starts one container per spec, using ids[i] as
+// the ID of specs[i]. It panics if conf.RootDir is empty.
+//
+// On success it returns the containers in spec order together with a
+// function that destroys them and cleans up their bundle directories. On
+// failure it returns nil values and an error.
+func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) {
+	if len(conf.RootDir) == 0 {
+		panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.")
+	}
+
+	var (
+		containers []*Container
+		cleanups   []func()
+	)
+	// Registered first so that containers are destroyed before any bundle
+	// directory cleanup appended below runs. The closure reads containers
+	// when it is called, so it covers every container created by then.
+	cleanups = append(cleanups, func() {
+		for _, c := range containers {
+			c.Destroy()
+		}
+	})
+	cleanupAll := func() {
+		for _, c := range cleanups {
+			c()
+		}
+	}
+	// NOTE(review): presumably Clean runs cleanupAll unless Release was
+	// called first, so the error returns below clean up after themselves;
+	// confirm against specutils.MakeCleanup.
+	localClean := specutils.MakeCleanup(cleanupAll)
+	defer localClean.Clean()
+
+	for i, spec := range specs {
+		bundleDir, cleanup, err := testutil.SetupBundleDir(spec)
+		if err != nil {
+			return nil, nil, fmt.Errorf("error setting up container: %v", err)
+		}
+		cleanups = append(cleanups, cleanup)
+
+		args := Args{
+			ID:        ids[i],
+			Spec:      spec,
+			BundleDir: bundleDir,
+		}
+		cont, err := New(conf, args)
+		if err != nil {
+			return nil, nil, fmt.Errorf("error creating container: %v", err)
+		}
+		containers = append(containers, cont)
+
+		if err := cont.Start(conf); err != nil {
+			return nil, nil, fmt.Errorf("error starting container: %v", err)
+		}
+	}
+
+	// Everything started: hand responsibility for cleanup to the caller.
+	localClean.Release()
+	return containers, cleanupAll, nil
+}
+
+// execDesc describes one command for execMany to run and the exit status it
+// is expected to produce.
+type execDesc struct {
+	// c is the container in which the command is executed.
+	c *Container
+	// cmd is the argument vector of the command.
+	cmd []string
+	// want is the expected exit status.
+	want int
+	// desc is a description included in the error when the status differs.
+	desc string
+}
+
+// execMany runs the described commands one after another, each in its own
+// container, and stops at the first one that either fails to execute or
+// exits with a status other than the expected one. It returns nil when all
+// commands behave as described.
+func execMany(execs []execDesc) error {
+	for _, e := range execs {
+		args := &control.ExecArgs{Argv: e.cmd}
+		ws, err := e.c.executeSync(args)
+		if err != nil {
+			return fmt.Errorf("error executing %+v: %v", args, err)
+		}
+		if got := ws.ExitStatus(); got != e.want {
+			return fmt.Errorf("%q: exec %q got exit status: %d, want: %d", e.desc, e.cmd, got, e.want)
+		}
+	}
+	return nil
+}
+
+// createSharedMount annotates every spec in pod with the source, type and,
+// when present, the comma-joined options of mount under the given name, and
+// sets the mount's share annotation to "pod".
+func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
+	keyPrefix := boot.MountPrefix + name
+	for _, s := range pod {
+		ann := s.Annotations
+		ann[keyPrefix+".source"] = mount.Source
+		ann[keyPrefix+".type"] = mount.Type
+		ann[keyPrefix+".share"] = "pod"
+		if len(mount.Options) == 0 {
+			continue
+		}
+		ann[keyPrefix+".options"] = strings.Join(mount.Options, ",")
+	}
+}
+
+// TestMultiContainerSanity checks that two dead-simple containers can run in
+// the same sandbox, each showing its own sleep process.
+func TestMultiContainerSanity(t *testing.T) {
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// Two identical sleep containers.
+			sleep := []string{"sleep", "100"}
+			podSpecs, ids := createSpecs(sleep, sleep)
+			containers, cleanup, err := startContainers(conf, podSpecs, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
+
+			// Expected process list per container, in container order.
+			wantLists := [][]*control.Process{
+				{{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}},
+				{{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}},
+			}
+			for idx, want := range wantLists {
+				if waitErr := waitForProcessList(containers[idx], want); waitErr != nil {
+					t.Errorf("failed to wait for sleep to start: %v", waitErr)
+				}
+			}
+		})
+	}
+}
+
+// TestMultiPIDNS checks that two dead-simple containers can run in the same
+// sandbox with different PID namespaces: the sleep process is expected to be
+// PID 1 in both containers.
+func TestMultiPIDNS(t *testing.T) {
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// Two sleep containers; the second one asks for a PID namespace.
+			sleep := []string{"sleep", "100"}
+			testSpecs, ids := createSpecs(sleep, sleep)
+			pidNamespace := specs.LinuxNamespace{Type: "pid"}
+			testSpecs[1].Linux = &specs.Linux{
+				Namespaces: []specs.LinuxNamespace{pidNamespace},
+			}
+
+			containers, cleanup, err := startContainers(conf, testSpecs, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
+
+			// Both containers must list sleep as PID 1.
+			for _, idx := range []int{0, 1} {
+				want := []*control.Process{
+					{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+				}
+				if waitErr := waitForProcessList(containers[idx], want); waitErr != nil {
+					t.Errorf("failed to wait for sleep to start: %v", waitErr)
+				}
+			}
+		})
+	}
+}
+
+// TestMultiPIDNSPath checks PID namespaces given by path. The first two
+// containers name the same namespace path and the third names a different
+// one; sleep is expected to be PID 1 in the first and third containers and
+// PID 2 in the second.
+func TestMultiPIDNSPath(t *testing.T) {
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// Three sleep containers, each with a PID namespace path.
+			sleep := []string{"sleep", "100"}
+			testSpecs, ids := createSpecs(sleep, sleep, sleep)
+			nsPaths := []string{"/proc/1/ns/pid", "/proc/1/ns/pid", "/proc/2/ns/pid"}
+			for idx, nsPath := range nsPaths {
+				testSpecs[idx].Linux = &specs.Linux{
+					Namespaces: []specs.LinuxNamespace{
+						{Type: "pid", Path: nsPath},
+					},
+				}
+			}
+
+			containers, cleanup, err := startContainers(conf, testSpecs, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
+
+			// Containers 0 and 2 each expect sleep as PID 1.
+			wantPID1 := []*control.Process{
+				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+			}
+			for _, idx := range []int{0, 2} {
+				if waitErr := waitForProcessList(containers[idx], wantPID1); waitErr != nil {
+					t.Errorf("failed to wait for sleep to start: %v", waitErr)
+				}
+			}
+
+			// Container 1 expects sleep as PID 2.
+			wantPID2 := []*control.Process{
+				{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+			}
+			if waitErr := waitForProcessList(containers[1], wantPID2); waitErr != nil {
+				t.Errorf("failed to wait for sleep to start: %v", waitErr)
+			}
+		})
+	}
+}
+
+// TestMultiContainerWait waits on a short-lived second container from
+// several goroutines at once, both through Wait and through WaitPID, and
+// then checks that the long-lived root container is still running.
+func TestMultiContainerWait(t *testing.T) {
+	rootDir, cleanup, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer cleanup()
+
+	conf := testutil.TestConfig(t)
+	conf.RootDir = rootDir
+
+	// The root container outlives the test; the second container, the one
+	// being waited on, is much shorter lived.
+	longLived := []string{"sleep", "100"}
+	shortLived := []string{"sleep", "1"}
+	podSpecs, ids := createSpecs(longLived, shortLived)
+
+	containers, cleanup, err := startContainers(conf, podSpecs, ids)
+	if err != nil {
+		t.Fatalf("error starting containers: %v", err)
+	}
+	defer cleanup()
+
+	// Make sure the short-lived process is up before waiting on it.
+	wantChild := []*control.Process{
+		{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+	}
+	if waitErr := waitForProcessList(containers[1], wantChild); waitErr != nil {
+		t.Errorf("failed to wait for sleep to start: %v", waitErr)
+	}
+
+	const waiters = 3
+	child := containers[1]
+	wg := sync.WaitGroup{}
+
+	// Container-level waits: the first must report exit status 0, and a
+	// repeated wait on the stopped container must not fail.
+	for i := 0; i < waiters; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			ws, err := child.Wait()
+			switch {
+			case err != nil:
+				t.Errorf("failed to wait for process %s: %v", child.Spec.Process.Args, err)
+			case ws.ExitStatus() != 0:
+				t.Errorf("process %s exited with non-zero status %d", child.Spec.Process.Args, ws.ExitStatus())
+			}
+			if _, err := child.Wait(); err != nil {
+				t.Errorf("wait for stopped container %s shouldn't fail: %v", child.Spec.Process.Args, err)
+			}
+		}()
+	}
+
+	// PID-level waits: the first must report exit status 0, and a repeated
+	// wait on the stopped PID must fail.
+	for i := 0; i < waiters; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			const pid = 2
+			ws, err := child.WaitPID(pid)
+			switch {
+			case err != nil:
+				t.Errorf("failed to wait for PID %d: %v", pid, err)
+			case ws.ExitStatus() != 0:
+				t.Errorf("PID %d exited with non-zero status %d", pid, ws.ExitStatus())
+			}
+			if _, err := child.WaitPID(pid); err == nil {
+				t.Errorf("wait for stopped PID %d should fail", pid)
+			}
+		}()
+	}
+
+	wg.Wait()
+
+	// Once all waits are back, the root container must still be running
+	// while the child has finished.
+	wantRoot := []*control.Process{
+		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+	}
+	if waitErr := waitForProcessList(containers[0], wantRoot); waitErr != nil {
+		t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), waitErr)
+	}
+}
+
+// TestExecWait ensures that we can wait on containers and on individual
+// processes in the sandbox that have already exited.
+func TestExecWait(t *testing.T) {
+	rootDir, cleanup, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer cleanup()
+
+	conf := testutil.TestConfig(t)
+	conf.RootDir = rootDir
+
+	// The first container should run the entire duration of the test.
+	cmd1 := []string{"sleep", "100"}
+	// We'll wait on the second container, which is much shorter lived.
+	cmd2 := []string{"sleep", "1"}
+	specs, ids := createSpecs(cmd1, cmd2)
+	containers, cleanup, err := startContainers(conf, specs, ids)
+	if err != nil {
+		t.Fatalf("error starting containers: %v", err)
+	}
+	defer cleanup()
+
+	// Check via ps that process is running.
+	expectedPL := []*control.Process{
+		{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+	}
+	if err := waitForProcessList(containers[1], expectedPL); err != nil {
+		t.Fatalf("failed to wait for sleep to start: %v", err)
+	}
+
+	// Wait for the second container to finish.
+	if err := waitForProcessCount(containers[1], 0); err != nil {
+		t.Fatalf("failed to wait for second container to stop: %v", err)
+	}
+
+	// Get the second container exit status.
+	if ws, err := containers[1].Wait(); err != nil {
+		t.Fatalf("failed to wait for process %s: %v", containers[1].Spec.Process.Args, err)
+	} else if es := ws.ExitStatus(); es != 0 {
+		t.Fatalf("process %s exited with non-zero status %d", containers[1].Spec.Process.Args, es)
+	}
+	if _, err := containers[1].Wait(); err != nil {
+		t.Fatalf("wait for stopped container %s shouldn't fail: %v", containers[1].Spec.Process.Args, err)
+	}
+
+	// Execute another process in the first container.
+	args := &control.ExecArgs{
+		Filename:         "/bin/sleep",
+		Argv:             []string{"/bin/sleep", "1"},
+		WorkingDirectory: "/",
+		KUID:             0,
+	}
+	pid, err := containers[0].Execute(args)
+	if err != nil {
+		t.Fatalf("error executing: %v", err)
+	}
+
+	// Wait for the exec'd process to exit: only the container's original
+	// sleep process should remain in the first container's process list.
+	expectedPL = []*control.Process{
+		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+	}
+	if err := waitForProcessList(containers[0], expectedPL); err != nil {
+		t.Fatalf("failed to wait for exec'd process to exit: %v", err)
+	}
+
+	// Get the exit status from the exec'd process.
+	if ws, err := containers[0].WaitPID(pid); err != nil {
+		t.Fatalf("failed to wait for process %+v with pid %d: %v", args, pid, err)
+	} else if es := ws.ExitStatus(); es != 0 {
+		t.Fatalf("process %+v exited with non-zero status %d", args, es)
+	}
+	if _, err := containers[0].WaitPID(pid); err == nil {
+		t.Fatalf("wait for stopped process %+v should fail", args)
+	}
+}
+
+// TestMultiContainerMount tests that bind mounts can be used with multiple
+// containers.
+//
+// The second container touches a file under the bind mount destination and
+// must exit with status 0.
+func TestMultiContainerMount(t *testing.T) {
+	cmd1 := []string{"sleep", "100"}
+
+	// 'src != dst' ensures that 'dst' doesn't exist in the host and must be
+	// properly mapped inside the container to work.
+	src, err := ioutil.TempDir(testutil.TmpDir(), "container")
+	if err != nil {
+		t.Fatal("ioutil.TempDir failed:", err)
+	}
+	// Registered first so that it runs last, after the containers using the
+	// bind mount have been cleaned up.
+	defer os.RemoveAll(src)
+	dst := src + ".dst"
+	cmd2 := []string{"touch", filepath.Join(dst, "file")}
+
+	sps, ids := createSpecs(cmd1, cmd2)
+	sps[1].Mounts = append(sps[1].Mounts, specs.Mount{
+		Source:      src,
+		Destination: dst,
+		Type:        "bind",
+	})
+
+	// Setup the containers.
+	rootDir, cleanup, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer cleanup()
+
+	conf := testutil.TestConfig(t)
+	conf.RootDir = rootDir
+
+	containers, cleanup, err := startContainers(conf, sps, ids)
+	if err != nil {
+		t.Fatalf("error starting containers: %v", err)
+	}
+	defer cleanup()
+
+	// The 'touch' only succeeds if the bind mount was set up correctly.
+	ws, err := containers[1].Wait()
+	if err != nil {
+		t.Error("error waiting on container:", err)
+	}
+	if !ws.Exited() || ws.ExitStatus() != 0 {
+		t.Error("container failed, waitStatus:", ws)
+	}
+}
+
+// TestMultiContainerSignal checks that it is possible to signal individual
+// containers without killing the entire sandbox.
+//
+// The scenario runs once per configuration returned by configs(t, all...):
+// the second container is killed and destroyed while the first keeps running,
+// then the first (root) container is killed and the gofer and sandbox
+// processes are expected to go away.
+func TestMultiContainerSignal(t *testing.T) {
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// Setup the containers.
+			sleep := []string{"sleep", "100"}
+			specs, ids := createSpecs(sleep, sleep)
+			containers, cleanup, err := startContainers(conf, specs, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
+
+			// Check via ps that container 1 process is running.
+			expectedPL := []*control.Process{
+				{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+			}
+
+			if err := waitForProcessList(containers[1], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
+
+			// Kill process 2.
+			if err := containers[1].SignalContainer(syscall.SIGKILL, false); err != nil {
+				t.Errorf("failed to kill process 2: %v", err)
+			}
+
+			// Make sure process 1 is still running.
+			expectedPL = []*control.Process{
+				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+			}
+			if err := waitForProcessList(containers[0], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
+
+			// goferPid is reset when container is destroyed.
+			goferPid := containers[1].GoferPid
+
+			// Destroy container and ensure container's gofer process has exited.
+			if err := containers[1].Destroy(); err != nil {
+				t.Errorf("failed to destroy container: %v", err)
+			}
+			// ECHILD is the only accepted outcome here; anything else,
+			// including a successful wait, is reported as a failure.
+			// NOTE(review): presumably Destroy has already reaped the gofer,
+			// leaving nothing to wait for — confirm.
+			_, _, err = specutils.RetryEintr(func() (uintptr, uintptr, error) {
+				cpid, err := syscall.Wait4(goferPid, nil, 0, nil)
+				return uintptr(cpid), 0, err
+			})
+			if err != syscall.ECHILD {
+				t.Errorf("error waiting for gofer to exit: %v", err)
+			}
+			// Make sure process 1 is still running.
+			if err := waitForProcessList(containers[0], expectedPL); err != nil {
+				t.Errorf("failed to wait for sleep to start: %v", err)
+			}
+
+			// Now that process 2 is gone, ensure we get an error trying to
+			// signal it again.
+			if err := containers[1].SignalContainer(syscall.SIGKILL, false); err == nil {
+				t.Errorf("container %q shouldn't exist, but we were able to signal it", containers[1].ID)
+			}
+
+			// Kill process 1.
+			if err := containers[0].SignalContainer(syscall.SIGKILL, false); err != nil {
+				t.Errorf("failed to kill process 1: %v", err)
+			}
+
+			// Ensure that container's gofer and sandbox process are no more.
+			// Both nil and ECHILD are accepted from blockUntilWaitable.
+			err = blockUntilWaitable(containers[0].GoferPid)
+			if err != nil && err != syscall.ECHILD {
+				t.Errorf("error waiting for gofer to exit: %v", err)
+			}
+
+			err = blockUntilWaitable(containers[0].Sandbox.Pid)
+			if err != nil && err != syscall.ECHILD {
+				t.Errorf("error waiting for sandbox to exit: %v", err)
+			}
+
+			// The sentry should be gone, so signaling should yield an error.
+			if err := containers[0].SignalContainer(syscall.SIGKILL, false); err == nil {
+				t.Errorf("sandbox %q shouldn't exist, but we were able to signal it", containers[0].Sandbox.ID)
+			}
+
+			// Destroying the container must still succeed after the sandbox
+			// is gone.
+			if err := containers[0].Destroy(); err != nil {
+				t.Errorf("failed to destroy container: %v", err)
+			}
+		})
+	}
+}
+
+// TestMultiContainerDestroy checks that containers are properly cleaned up
+// when they are destroyed.
+//
+// For every configuration returned by configs(t, all...), a fork bomb runs in
+// the second container (both as its init process and through exec). After
+// the container is destroyed, only the root container's sleep process may
+// remain in the sandbox.
+func TestMultiContainerDestroy(t *testing.T) {
+	app, err := testutil.FindFile("test/cmd/test_app/test_app")
+	if err != nil {
+		t.Fatal("error finding test_app:", err)
+	}
+
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// First container will remain intact while the second container is killed.
+			podSpecs, ids := createSpecs(
+				[]string{"sleep", "100"},
+				[]string{app, "fork-bomb"})
+
+			// Run the fork bomb in a PID namespace to prevent processes to be
+			// re-parented to PID=1 in the root container.
+			podSpecs[1].Linux = &specs.Linux{
+				Namespaces: []specs.LinuxNamespace{{Type: "pid"}},
+			}
+			containers, cleanup, err := startContainers(conf, podSpecs, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
+
+			// Exec more processes to ensure signal all works for exec'd processes too.
+			args := &control.ExecArgs{
+				Filename: app,
+				Argv:     []string{app, "fork-bomb"},
+			}
+			if _, err := containers[1].Execute(args); err != nil {
+				t.Fatalf("error exec'ing: %v", err)
+			}
+
+			// Let it brew...
+			time.Sleep(500 * time.Millisecond)
+
+			if err := containers[1].Destroy(); err != nil {
+				t.Fatalf("error destroying container: %v", err)
+			}
+
+			// Check that destroy killed all processes belonging to the container and
+			// waited for them to exit before returning.
+			// The list is read exactly once, with no polling, so any process
+			// other than the root container's sleep fails the check.
+			pss, err := containers[0].Sandbox.Processes("")
+			if err != nil {
+				t.Fatalf("error getting process data from sandbox: %v", err)
+			}
+			expectedPL := []*control.Process{{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}}
+			if r, err := procListsEqual(pss, expectedPL); !r {
+				t.Errorf("container got process list: %s, want: %s: error: %v",
+					procListToString(pss), procListToString(expectedPL), err)
+			}
+
+			// Check that cont.Destroy is safe to call multiple times.
+			if err := containers[1].Destroy(); err != nil {
+				t.Errorf("error destroying container: %v", err)
+			}
+		})
+	}
+}
+
+// TestMultiContainerProcesses verifies that the process list reported for a
+// container only contains that container's own processes, including any
+// process added later through exec.
+func TestMultiContainerProcesses(t *testing.T) {
+	rootDir, cleanupRoot, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer cleanupRoot()
+
+	conf := testutil.TestConfig(t)
+	conf.RootDir = rootDir
+
+	// The curly braces keep the 'sh' process alive. Without them the shell
+	// would simply execve into 'sleep' and the two containers would be
+	// indistinguishable.
+	rootCmd := []string{"sleep", "100"}
+	childCmd := []string{"sh", "-c", "{ sleep 100; }"}
+	podSpecs, ids := createSpecs(rootCmd, childCmd)
+	containers, cleanupContainers, err := startContainers(conf, podSpecs, ids)
+	if err != nil {
+		t.Fatalf("error starting containers: %v", err)
+	}
+	defer cleanupContainers()
+
+	// The root container must not list processes from other containers.
+	wantRoot := []*control.Process{
+		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+	}
+	if err := waitForProcessList(containers[0], wantRoot); err != nil {
+		t.Errorf("failed to wait for process to start: %v", err)
+	}
+
+	// Likewise, the child container only lists its shell and the sleep that
+	// the shell spawned.
+	wantChild := []*control.Process{
+		{PID: 2, Cmd: "sh", Threads: []kernel.ThreadID{2}},
+		{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}},
+	}
+	if err := waitForProcessList(containers[1], wantChild); err != nil {
+		t.Errorf("failed to wait for process to start: %v", err)
+	}
+
+	// Exec into the child container; the new process must show up there.
+	execArgs := &control.ExecArgs{
+		Filename: "/bin/sleep",
+		Argv:     []string{"/bin/sleep", "100"},
+	}
+	if _, err := containers[1].Execute(execArgs); err != nil {
+		t.Fatalf("error exec'ing: %v", err)
+	}
+	execd := &control.Process{PID: 4, Cmd: "sleep", Threads: []kernel.ThreadID{4}}
+	wantChild = append(wantChild, execd)
+	if err := waitForProcessList(containers[1], wantChild); err != nil {
+		t.Errorf("failed to wait for process to start: %v", err)
+	}
+
+	// The exec must not have leaked into the root container's list.
+	if err := waitForProcessList(containers[0], wantRoot); err != nil {
+		t.Errorf("failed to wait for process to start: %v", err)
+	}
+}
+
+// TestMultiContainerKillAll checks that all processes that belong to a
+// container are killed when SIGKILL is sent to *all* processes in that
+// container.
+//
+// Two scenarios are exercised: one where the container's init process is
+// killed first (so the container is already stopped when the kill-all is
+// sent), and one where the kill-all is sent to a running container. In both
+// cases the root container's processes must be unaffected.
+func TestMultiContainerKillAll(t *testing.T) {
+	rootDir, cleanup, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer cleanup()
+
+	conf := testutil.TestConfig(t)
+	conf.RootDir = rootDir
+
+	// The test binary location does not change between scenarios, so look it
+	// up only once.
+	app, err := testutil.FindFile("test/cmd/test_app/test_app")
+	if err != nil {
+		t.Fatal("error finding test_app:", err)
+	}
+
+	for _, tc := range []struct {
+		killContainer bool
+	}{
+		{killContainer: true},
+		{killContainer: false},
+	} {
+		// First container will remain intact while the second container is killed.
+		specs, ids := createSpecs(
+			[]string{app, "task-tree", "--depth=2", "--width=2"},
+			[]string{app, "task-tree", "--depth=4", "--width=2"})
+		containers, cleanup, err := startContainers(conf, specs, ids)
+		if err != nil {
+			t.Fatalf("error starting containers: %v", err)
+		}
+		// Deferred calls only run when the test function returns, so the
+		// containers of every scenario stay around until then.
+		defer cleanup()
+
+		// Wait until all processes are created.
+		rootProcCount := int(math.Pow(2, 3) - 1)
+		if err := waitForProcessCount(containers[0], rootProcCount); err != nil {
+			t.Fatalf("error waiting for processes: %v", err)
+		}
+		procCount := int(math.Pow(2, 5) - 1)
+		if err := waitForProcessCount(containers[1], procCount); err != nil {
+			t.Fatalf("error waiting for processes: %v", err)
+		}
+
+		// Exec more processes to ensure signal works for exec'd processes too.
+		args := &control.ExecArgs{
+			Filename: app,
+			Argv:     []string{app, "task-tree", "--depth=2", "--width=2"},
+		}
+		if _, err := containers[1].Execute(args); err != nil {
+			t.Fatalf("error exec'ing: %v", err)
+		}
+		// Wait for these new processes to start.
+		procCount += int(math.Pow(2, 3) - 1)
+		if err := waitForProcessCount(containers[1], procCount); err != nil {
+			t.Fatalf("error waiting for processes: %v", err)
+		}
+
+		if tc.killContainer {
+			// First kill the init process to make the container be stopped with
+			// processes still running inside.
+			if err := containers[1].SignalContainer(syscall.SIGKILL, false); err != nil {
+				t.Fatalf("failed to send SIGKILL to init process of container %q: %v", containers[1].ID, err)
+			}
+			op := func() error {
+				c, err := Load(conf.RootDir, ids[1])
+				if err != nil {
+					return err
+				}
+				if c.Status != Stopped {
+					return fmt.Errorf("container is not stopped")
+				}
+				return nil
+			}
+			if err := testutil.Poll(op, 5*time.Second); err != nil {
+				t.Fatalf("container did not stop %q: %v", containers[1].ID, err)
+			}
+		}
+
+		// Reload the container so the signal is sent using its current
+		// on-disk state.
+		c, err := Load(conf.RootDir, ids[1])
+		if err != nil {
+			// c is nil when Load fails, so report the ID that was requested.
+			t.Fatalf("failed to load child container %q: %v", ids[1], err)
+		}
+		// Kill'Em All
+		if err := c.SignalContainer(syscall.SIGKILL, true); err != nil {
+			t.Fatalf("failed to send SIGKILL to container %q: %v", c.ID, err)
+		}
+
+		// Check that all processes are gone.
+		if err := waitForProcessCount(containers[1], 0); err != nil {
+			t.Fatalf("error waiting for processes: %v", err)
+		}
+		// Check that root container was not affected.
+		if err := waitForProcessCount(containers[0], rootProcCount); err != nil {
+			t.Fatalf("error waiting for processes: %v", err)
+		}
+	}
+}
+
+// TestMultiContainerDestroyNotStarted verifies that a sub-container which was
+// created but never started can be destroyed while the root container is
+// running.
+func TestMultiContainerDestroyNotStarted(t *testing.T) {
+	sleepCmd := []string{"/bin/sleep", "100"}
+	podSpecs, ids := createSpecs(sleepCmd, []string{"/bin/sleep", "100"})
+
+	conf := testutil.TestConfig(t)
+	_, rootBundleDir, cleanupRoot, err := testutil.SetupContainer(podSpecs[0], conf)
+	if err != nil {
+		t.Fatalf("error setting up container: %v", err)
+	}
+	defer cleanupRoot()
+
+	// Bring up the root container first.
+	rootCont, err := New(conf, Args{
+		ID:        ids[0],
+		Spec:      podSpecs[0],
+		BundleDir: rootBundleDir,
+	})
+	if err != nil {
+		t.Fatalf("error creating root container: %v", err)
+	}
+	defer rootCont.Destroy()
+	if err := rootCont.Start(conf); err != nil {
+		t.Fatalf("error starting root container: %v", err)
+	}
+
+	// Create the sub-container, but never start it.
+	subBundleDir, cleanupSub, err := testutil.SetupBundleDir(podSpecs[1])
+	if err != nil {
+		t.Fatalf("error setting up container: %v", err)
+	}
+	defer cleanupSub()
+
+	subCont, err := New(conf, Args{
+		ID:        ids[1],
+		Spec:      podSpecs[1],
+		BundleDir: subBundleDir,
+	})
+	if err != nil {
+		t.Fatalf("error creating container: %v", err)
+	}
+
+	// Destroying the never-started container must succeed.
+	if err := subCont.Destroy(); err != nil {
+		t.Fatalf("deleting non-started container failed: %v", err)
+	}
+}
+
+// TestMultiContainerDestroyStarting attempts to force a race between start
+// and destroy.
+func TestMultiContainerDestroyStarting(t *testing.T) {
+	cmds := make([][]string, 10)
+	for i := range cmds {
+		cmds[i] = []string{"/bin/sleep", "100"}
+	}
+	specs, ids := createSpecs(cmds...)
+
+	conf := testutil.TestConfig(t)
+	rootDir, bundleDir, cleanup, err := testutil.SetupContainer(specs[0], conf)
+	if err != nil {
+		t.Fatalf("error setting up container: %v", err)
+	}
+	defer cleanup()
+
+	rootArgs := Args{
+		ID:        ids[0],
+		Spec:      specs[0],
+		BundleDir: bundleDir,
+	}
+	root, err := New(conf, rootArgs)
+	if err != nil {
+		t.Fatalf("error creating root container: %v", err)
+	}
+	defer root.Destroy()
+	if err := root.Start(conf); err != nil {
+		t.Fatalf("error starting root container: %v", err)
+	}
+
+	wg := sync.WaitGroup{}
+	for i := range cmds {
+		if i == 0 {
+			continue // skip root container
+		}
+
+		bundleDir, cleanup, err := testutil.SetupBundleDir(specs[i])
+		if err != nil {
+			t.Fatalf("error setting up container: %v", err)
+		}
+		defer cleanup()
+
+		rootArgs := Args{
+			ID:        ids[i],
+			Spec:      specs[i],
+			BundleDir: bundleDir,
+		}
+		cont, err := New(conf, rootArgs)
+		if err != nil {
+			t.Fatalf("error creating container: %v", err)
+		}
+
+		// Container is not thread safe, so load another instance to run in
+		// concurrently.
+		startCont, err := Load(rootDir, ids[i])
+		if err != nil {
+			t.Fatalf("error loading container: %v", err)
+		}
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			startCont.Start(conf) // ignore failures, start can fail if destroy runs first.
+		}()
+
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			if err := cont.Destroy(); err != nil {
+				t.Errorf("deleting non-started container failed: %v", err)
+			}
+		}()
+	}
+	wg.Wait()
+}
+
+// TestMultiContainerDifferentFilesystems tests that different containers have
+// different root filesystems.
+func TestMultiContainerDifferentFilesystems(t *testing.T) {
+	filename := "/foo"
+	// Root container will create file and then sleep.
+	cmdRoot := []string{"sh", "-c", fmt.Sprintf("touch %q && sleep 100", filename)}
+
+	// Child containers will assert that the file does not exist, and will
+	// then create it.
+	script := fmt.Sprintf("if [ -f %q ]; then exit 1; else touch %q; fi", filename, filename)
+	cmd := []string{"sh", "-c", script}
+
+	rootDir, cleanup, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer cleanup()
+
+	conf := testutil.TestConfig(t)
+	conf.RootDir = rootDir
+
+	// Make sure overlay is enabled, and none of the root filesystems are
+	// read-only, otherwise we won't be able to create the file.
+	conf.Overlay = true
+	specs, ids := createSpecs(cmdRoot, cmd, cmd)
+	for _, s := range specs {
+		s.Root.Readonly = false
+	}
+
+	containers, cleanup, err := startContainers(conf, specs, ids)
+	if err != nil {
+		t.Fatalf("error starting containers: %v", err)
+	}
+	defer cleanup()
+
+	// Both child containers should exit successfully.
+	for i, c := range containers {
+		if i == 0 {
+			// Don't wait on the root.
+			continue
+		}
+		if ws, err := c.Wait(); err != nil {
+			t.Errorf("failed to wait for process %s: %v", c.Spec.Process.Args, err)
+		} else if es := ws.ExitStatus(); es != 0 {
+			t.Errorf("process %s exited with non-zero status %d", c.Spec.Process.Args, es)
+		}
+	}
+}
+
+// TestMultiContainerContainerDestroyStress tests that IO operations continue
+// to work after containers have been stopped and gofers killed.
+//
+// Children are run in batches. Every child must exit with status 0 and be
+// destroyed successfully before the next batch starts.
+func TestMultiContainerContainerDestroyStress(t *testing.T) {
+	app, err := testutil.FindFile("test/cmd/test_app/test_app")
+	if err != nil {
+		t.Fatal("error finding test_app:", err)
+	}
+
+	// Setup containers. Root container just reaps children, while the others
+	// perform some IOs. Children are executed in 3 batches of 10. Within the
+	// batch there is overlap between containers starting and being destroyed. In
+	// between batches all containers stop before starting another batch.
+	cmds := [][]string{{app, "reaper"}}
+	const batchSize = 10
+	for i := 0; i < 3*batchSize; i++ {
+		dir, err := ioutil.TempDir(testutil.TmpDir(), "gofer-stop-test")
+		if err != nil {
+			t.Fatal("ioutil.TempDir failed:", err)
+		}
+		defer os.RemoveAll(dir)
+
+		cmd := "find /bin -type f | head | xargs -I SRC cp SRC " + dir
+		cmds = append(cmds, []string{"sh", "-c", cmd})
+	}
+	allSpecs, allIDs := createSpecs(cmds...)
+
+	// Split up the specs and IDs.
+	rootSpec := allSpecs[0]
+	rootID := allIDs[0]
+	childrenSpecs := allSpecs[1:]
+	childrenIDs := allIDs[1:]
+
+	conf := testutil.TestConfig(t)
+	_, bundleDir, cleanup, err := testutil.SetupContainer(rootSpec, conf)
+	if err != nil {
+		t.Fatalf("error setting up container: %v", err)
+	}
+	defer cleanup()
+
+	// Start root container.
+	rootArgs := Args{
+		ID:        rootID,
+		Spec:      rootSpec,
+		BundleDir: bundleDir,
+	}
+	root, err := New(conf, rootArgs)
+	if err != nil {
+		t.Fatalf("error creating root container: %v", err)
+	}
+	// Register the destroy before starting, so that the created container is
+	// also cleaned up when Start fails.
+	defer root.Destroy()
+	if err := root.Start(conf); err != nil {
+		t.Fatalf("error starting root container: %v", err)
+	}
+
+	// Run batches. Each batch starts containers in parallel, then wait and
+	// destroy them before starting another batch.
+	for i := 0; i < len(childrenSpecs); i += batchSize {
+		t.Logf("Starting batch from %d to %d", i, i+batchSize)
+		specs := childrenSpecs[i : i+batchSize]
+		ids := childrenIDs[i : i+batchSize]
+
+		var children []*Container
+		for j, spec := range specs {
+			bundleDir, cleanup, err := testutil.SetupBundleDir(spec)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			args := Args{
+				ID:        ids[j],
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			child, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			children = append(children, child)
+
+			if err := child.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			// Give a small gap between containers.
+			time.Sleep(50 * time.Millisecond)
+		}
+		// Wait for and destroy every child of this batch before moving on.
+		for _, child := range children {
+			ws, err := child.Wait()
+			if err != nil {
+				t.Fatalf("waiting for container: %v", err)
+			}
+			if !ws.Exited() || ws.ExitStatus() != 0 {
+				t.Fatalf("container failed, waitStatus: %x (%d)", ws, ws.ExitStatus())
+			}
+			if err := child.Destroy(); err != nil {
+				t.Fatalf("error destroying container: %v", err)
+			}
+		}
+	}
+}
+
+// Test that pod shared mounts are properly mounted in 2 containers and that
+// changes made from one container are reflected in the other.
+//
+// The scenario runs once per configuration returned by configs(t, all...).
+func TestMultiContainerSharedMount(t *testing.T) {
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// Setup the containers.
+			sleep := []string{"sleep", "100"}
+			podSpec, ids := createSpecs(sleep, sleep)
+			mnt0 := specs.Mount{
+				Destination: "/mydir/test",
+				Source:      "/some/dir",
+				Type:        "tmpfs",
+				Options:     nil,
+			}
+			podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+
+			// The second container uses the same mount at a different
+			// destination.
+			mnt1 := mnt0
+			mnt1.Destination = "/mydir2/test2"
+			podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+
+			createSharedMount(mnt0, "test-mount", podSpec...)
+
+			containers, cleanup, err := startContainers(conf, podSpec, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
+
+			// file0 and file1 name the same entry of the shared mount, as
+			// seen from container0 and container1 respectively.
+			file0 := path.Join(mnt0.Destination, "abc")
+			file1 := path.Join(mnt1.Destination, "abc")
+			execs := []execDesc{
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
+					desc: "directory is mounted in container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
+					desc: "directory is mounted in container1",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/touch", file0},
+					desc: "create file in container0",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "-f", file0},
+					desc: "file appears in container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "-f", file1},
+					desc: "file appears in container1",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/bin/rm", file1},
+					desc: "file removed from container1",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "!", "-f", file0},
+					desc: "file removed from container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "!", "-f", file1},
+					desc: "file removed from container1",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/bin/mkdir", file1},
+					desc: "create directory in container1",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "-d", file0},
+					desc: "dir appears in container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "-d", file1},
+					desc: "dir appears in container1",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/bin/rmdir", file0},
+					desc: "remove directory from container0",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "!", "-d", file0},
+					desc: "dir removed from container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "!", "-d", file1},
+					desc: "dir removed from container1",
+				},
+			}
+			if err := execMany(execs); err != nil {
+				t.Fatal(err.Error())
+			}
+		})
+	}
+}
+
+// Test that a shared pod mount requested with the "ro" option cannot be
+// written to from either container.
+func TestMultiContainerSharedMountReadonly(t *testing.T) {
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanupRoot, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanupRoot()
+			conf.RootDir = rootDir
+
+			// Two sleeping containers sharing one read-only tmpfs mount, each
+			// at its own destination.
+			sleepCmd := []string{"sleep", "100"}
+			podSpecs, ids := createSpecs(sleepCmd, sleepCmd)
+
+			firstMnt := specs.Mount{
+				Destination: "/mydir/test",
+				Source:      "/some/dir",
+				Type:        "tmpfs",
+				Options:     []string{"ro"},
+			}
+			secondMnt := firstMnt
+			secondMnt.Destination = "/mydir2/test2"
+
+			podSpecs[0].Mounts = append(podSpecs[0].Mounts, firstMnt)
+			podSpecs[1].Mounts = append(podSpecs[1].Mounts, secondMnt)
+
+			createSharedMount(firstMnt, "test-mount", podSpecs...)
+
+			containers, cleanupContainers, err := startContainers(conf, podSpecs, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanupContainers()
+
+			firstFile := path.Join(firstMnt.Destination, "abc")
+			secondFile := path.Join(secondMnt.Destination, "abc")
+
+			// The mount must be present in both containers, and touch is
+			// expected to exit with status 1 in both.
+			steps := []execDesc{
+				{
+					desc: "directory is mounted in container0",
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "-d", firstMnt.Destination},
+				},
+				{
+					desc: "directory is mounted in container1",
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "-d", secondMnt.Destination},
+				},
+				{
+					desc: "fails to write to container0",
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/touch", firstFile},
+					want: 1,
+				},
+				{
+					desc: "fails to write to container1",
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/touch", secondFile},
+					want: 1,
+				},
+			}
+			if err := execMany(steps); err != nil {
+				t.Fatal(err.Error())
+			}
+		})
+	}
+}
+
+// Test that shared pod mounts continue to work after container is restarted.
+//
+// A file is created through the shared mount, the second container is
+// destroyed and recreated with the same ID and spec, and the file must still
+// be visible (and removable) from both containers afterwards.
+func TestMultiContainerSharedMountRestart(t *testing.T) {
+	for name, conf := range configs(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			rootDir, cleanup, err := testutil.SetupRootDir()
+			if err != nil {
+				t.Fatalf("error creating root dir: %v", err)
+			}
+			defer cleanup()
+			conf.RootDir = rootDir
+
+			// Setup the containers.
+			sleep := []string{"sleep", "100"}
+			podSpec, ids := createSpecs(sleep, sleep)
+			mnt0 := specs.Mount{
+				Destination: "/mydir/test",
+				Source:      "/some/dir",
+				Type:        "tmpfs",
+				Options:     nil,
+			}
+			podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+
+			mnt1 := mnt0
+			mnt1.Destination = "/mydir2/test2"
+			podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+
+			createSharedMount(mnt0, "test-mount", podSpec...)
+
+			containers, cleanup, err := startContainers(conf, podSpec, ids)
+			if err != nil {
+				t.Fatalf("error starting containers: %v", err)
+			}
+			defer cleanup()
+
+			// file0 and file1 name the same entry of the shared mount, as
+			// seen from container0 and container1 respectively.
+			file0 := path.Join(mnt0.Destination, "abc")
+			file1 := path.Join(mnt1.Destination, "abc")
+			execs := []execDesc{
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/touch", file0},
+					desc: "create file in container0",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "-f", file0},
+					desc: "file appears in container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "-f", file1},
+					desc: "file appears in container1",
+				},
+			}
+			if err := execMany(execs); err != nil {
+				t.Fatal(err.Error())
+			}
+
+			// Restart the second container: destroy it, then create and start
+			// it again with the same ID and spec. A failed destroy would make
+			// the rest of the test meaningless, so stop right away.
+			if err := containers[1].Destroy(); err != nil {
+				t.Fatalf("error destroying container: %v", err)
+			}
+
+			bundleDir, cleanup, err := testutil.SetupBundleDir(podSpec[1])
+			if err != nil {
+				t.Fatalf("error restarting container: %v", err)
+			}
+			defer cleanup()
+
+			args := Args{
+				ID:        ids[1],
+				Spec:      podSpec[1],
+				BundleDir: bundleDir,
+			}
+			containers[1], err = New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			if err := containers[1].Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			// The file created before the restart must have survived, and the
+			// mount must still be shared between both containers.
+			execs = []execDesc{
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "-f", file0},
+					desc: "file is still in container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "-f", file1},
+					desc: "file is still in container1",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/bin/rm", file1},
+					desc: "file removed from container1",
+				},
+				{
+					c:    containers[0],
+					cmd:  []string{"/usr/bin/test", "!", "-f", file0},
+					desc: "file removed from container0",
+				},
+				{
+					c:    containers[1],
+					cmd:  []string{"/usr/bin/test", "!", "-f", file1},
+					desc: "file removed from container1",
+				},
+			}
+			if err := execMany(execs); err != nil {
+				t.Fatal(err.Error())
+			}
+		})
+	}
+}
+
+// Test that unsupported pod mount options are ignored when matching master
+// and slave mounts: the two containers declare the same shared mount with
+// different option lists, and both must still end up with it mounted.
+func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
+	rootDir, cleanupRoot, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer cleanupRoot()
+
+	conf := testutil.TestConfig(t)
+	conf.RootDir = rootDir
+
+	// Two sleeping containers whose mounts only differ in destination and in
+	// their option lists.
+	sleepCmd := []string{"/bin/sleep", "100"}
+	podSpecs, ids := createSpecs(sleepCmd, sleepCmd)
+
+	firstMnt := specs.Mount{
+		Destination: "/mydir/test",
+		Source:      "/some/dir",
+		Type:        "tmpfs",
+		Options:     []string{"rw", "relatime"},
+	}
+	secondMnt := firstMnt
+	secondMnt.Destination = "/mydir2/test2"
+	secondMnt.Options = []string{"rw", "nosuid"}
+
+	podSpecs[0].Mounts = append(podSpecs[0].Mounts, firstMnt)
+	podSpecs[1].Mounts = append(podSpecs[1].Mounts, secondMnt)
+
+	createSharedMount(firstMnt, "test-mount", podSpecs...)
+
+	containers, cleanupContainers, err := startContainers(conf, podSpecs, ids)
+	if err != nil {
+		t.Fatalf("error starting containers: %v", err)
+	}
+	defer cleanupContainers()
+
+	steps := []execDesc{
+		{
+			desc: "directory is mounted in container0",
+			c:    containers[0],
+			cmd:  []string{"/usr/bin/test", "-d", firstMnt.Destination},
+		},
+		{
+			desc: "directory is mounted in container1",
+			c:    containers[1],
+			cmd:  []string{"/usr/bin/test", "-d", secondMnt.Destination},
+		},
+	}
+	if err := execMany(steps); err != nil {
+		t.Fatal(err.Error())
+	}
+}
+
+// Test that one container can send an FD to another container, even though
+// they have distinct MountNamespaces.
+//
+// Three containers are started: a sleeping root, an FD sender and an FD
+// receiver. The sender and receiver must both exit with status 0.
+func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
+	app, err := testutil.FindFile("test/cmd/test_app/test_app")
+	if err != nil {
+		t.Fatal("error finding test_app:", err)
+	}
+
+	// We set up two containers with one shared mount that is used for a
+	// shared socket. The first container will send an FD over the socket
+	// to the second container. The FD corresponds to a file in the first
+	// container's mount namespace that is not part of the second
+	// container's mount namespace. However, the second container still
+	// should be able to read the FD.
+
+	// Create a shared mount where we will put the socket.
+	sharedMnt := specs.Mount{
+		Destination: "/mydir/test",
+		Type:        "tmpfs",
+		// Shared mounts need a Source, even for tmpfs. It is only used
+		// to match up different shared mounts inside the pod.
+		Source: "/some/dir",
+	}
+	socketPath := filepath.Join(sharedMnt.Destination, "socket")
+
+	// Create a writeable tmpfs mount where the FD sender app will create
+	// files to send. This will only be mounted in the FD sender.
+	writeableMnt := specs.Mount{
+		Destination: "/tmp",
+		Type:        "tmpfs",
+	}
+
+	rootDir, cleanup, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer cleanup()
+
+	conf := testutil.TestConfig(t)
+	conf.RootDir = rootDir
+
+	// Create the specs.
+	specs, ids := createSpecs(
+		[]string{"sleep", "1000"},
+		[]string{app, "fd_sender", "--socket", socketPath},
+		[]string{app, "fd_receiver", "--socket", socketPath},
+	)
+	createSharedMount(sharedMnt, "shared-mount", specs...)
+	// Each spec extends its own mount list: the sender (index 1) gets the
+	// shared and the writeable mount, the receiver (index 2) only the shared
+	// one.
+	specs[1].Mounts = append(specs[1].Mounts, sharedMnt, writeableMnt)
+	specs[2].Mounts = append(specs[2].Mounts, sharedMnt)
+
+	containers, cleanup, err := startContainers(conf, specs, ids)
+	if err != nil {
+		t.Fatalf("error starting containers: %v", err)
+	}
+	defer cleanup()
+
+	// Both containers should exit successfully.
+	for _, c := range containers[1:] {
+		if ws, err := c.Wait(); err != nil {
+			t.Errorf("failed to wait for process %s: %v", c.Spec.Process.Args, err)
+		} else if es := ws.ExitStatus(); es != 0 {
+			t.Errorf("process %s exited with non-zero status %d", c.Spec.Process.Args, es)
+		}
+	}
+}
+
+// Test that a container is destroyed when its gofer is killed; the root's gofer takes down the sandbox.
+func TestMultiContainerGoferKilled(t *testing.T) {
+	rootDir, cleanup, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer cleanup()
+
+	conf := testutil.TestConfig(t)
+	conf.RootDir = rootDir
+
+	sleep := []string{"sleep", "100"}
+	specs, ids := createSpecs(sleep, sleep, sleep)
+	containers, cleanup, err := startContainers(conf, specs, ids)
+	if err != nil {
+		t.Fatalf("error starting containers: %v", err)
+	}
+	defer cleanup()
+
+	// Ensure the last container is running before its gofer is killed.
+	c := containers[2]
+	expectedPL := []*control.Process{
+		{PID: 3, Cmd: "sleep", Threads: []kernel.ThreadID{3}},
+	}
+	if err := waitForProcessList(c, expectedPL); err != nil {
+		t.Errorf("failed to wait for sleep to start: %v", err)
+	}
+
+	// Kill container's gofer.
+	if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil {
+		t.Fatalf("syscall.Kill(%d, SIGKILL)=%v", c.GoferPid, err)
+	}
+
+	// Wait until container stops, i.e. until an empty process list is seen.
+	if err := waitForProcessList(c, nil); err != nil {
+		t.Errorf("Container %q was not stopped after gofer death: %v", c.ID, err)
+	}
+
+	// Check that container isn't running anymore: exec into it must fail.
+	args := &control.ExecArgs{Argv: []string{"/bin/true"}}
+	if _, err := c.executeSync(args); err == nil {
+		t.Fatalf("Container %q was not stopped after gofer death", c.ID)
+	}
+
+	// Check that other containers are unaffected: sleep still runs and exec works.
+	for i, c := range containers {
+		if i == 2 {
+			continue // container[2] has been killed.
+		}
+		pl := []*control.Process{
+			{PID: kernel.ThreadID(i + 1), Cmd: "sleep", Threads: []kernel.ThreadID{kernel.ThreadID(i + 1)}},
+		}
+		if err := waitForProcessList(c, pl); err != nil {
+			t.Errorf("Container %q was affected by another container: %v", c.ID, err)
+		}
+		args := &control.ExecArgs{Argv: []string{"/bin/true"}}
+		if _, err := c.executeSync(args); err != nil {
+			t.Fatalf("Container %q was affected by another container: %v", c.ID, err)
+		}
+	}
+
+	// Kill root container's gofer to bring entire sandbox down.
+	c = containers[0]
+	if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil {
+		t.Fatalf("syscall.Kill(%d, SIGKILL)=%v", c.GoferPid, err)
+	}
+
+	// Wait until sandbox stops. waitForProcessList will loop until sandbox exits
+	// and RPC errors out, so an error is the expected outcome here.
+	impossiblePL := []*control.Process{
+		{PID: 100, Cmd: "non-existent-process", Threads: []kernel.ThreadID{100}},
+	}
+	if err := waitForProcessList(c, impossiblePL); err == nil {
+		t.Fatalf("Sandbox was not killed after gofer death")
+	}
+
+	// Check that entire sandbox isn't running anymore: exec must fail everywhere.
+	for _, c := range containers {
+		args := &control.ExecArgs{Argv: []string{"/bin/true"}}
+		if _, err := c.executeSync(args); err == nil {
+			t.Fatalf("Container %q was not stopped after gofer death", c.ID)
+		}
+	}
+}
+
+// TestMultiContainerLoadSandbox checks that loadSandbox returns exactly the containers of one sandbox.
+func TestMultiContainerLoadSandbox(t *testing.T) {
+	sleep := []string{"sleep", "100"}
+	specs, ids := createSpecs(sleep, sleep, sleep)
+	rootDir, cleanup, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer cleanup()
+
+	conf := testutil.TestConfig(t)
+	conf.RootDir = rootDir
+
+	// Create containers for the sandbox. These are the ones loadSandbox must find.
+	wants, cleanup, err := startContainers(conf, specs, ids)
+	if err != nil {
+		t.Fatalf("error starting containers: %v", err)
+	}
+	defer cleanup()
+
+	// Then create unrelated containers under the same root directory.
+	for i := 0; i < 3; i++ {
+		specs, ids = createSpecs(sleep, sleep, sleep)
+		_, cleanup, err = startContainers(conf, specs, ids)
+		if err != nil {
+			t.Fatalf("error starting containers: %v", err)
+		}
+		defer cleanup()
+	}
+
+	// Create an unrelated directory under root.
+	dir := filepath.Join(conf.RootDir, "not-a-container")
+	if err := os.MkdirAll(dir, 0755); err != nil {
+		t.Fatalf("os.MkdirAll(%q)=%v", dir, err)
+	}
+
+	// Create a valid but empty container directory.
+	randomCID := testutil.RandomContainerID()
+	dir = filepath.Join(conf.RootDir, randomCID)
+	if err := os.MkdirAll(dir, 0755); err != nil {
+		t.Fatalf("os.MkdirAll(%q)=%v", dir, err)
+	}
+
+	// Load the sandbox and check that the correct containers were returned.
+	id := wants[0].Sandbox.ID
+	gots, err := loadSandbox(conf.RootDir, id)
+	if err != nil {
+		t.Fatalf("loadSandbox()=%v", err)
+	}
+	wantIDs := make(map[string]struct{})
+	for _, want := range wants {
+		wantIDs[want.ID] = struct{}{}
+	}
+	for _, got := range gots {
+		if got.Sandbox.ID != id {
+			t.Errorf("wrong sandbox ID, got: %v, want: %v", got.Sandbox.ID, id)
+		}
+		if _, ok := wantIDs[got.ID]; !ok {
+			t.Errorf("wrong container ID, got: %v, wants: %v", got.ID, wantIDs)
+		}
+		delete(wantIDs, got.ID) // Whatever remains afterwards was not returned.
+	}
+	if len(wantIDs) != 0 {
+		t.Errorf("containers not found: %v", wantIDs)
+	}
+}
+
+// TestMultiContainerRunNonRoot checks that child container can be configured
+// when running as non-privileged user.
+func TestMultiContainerRunNonRoot(t *testing.T) {
+	cmdRoot := []string{"/bin/sleep", "100"}
+	cmdSub := []string{"/bin/true"}
+	podSpecs, ids := createSpecs(cmdRoot, cmdSub)
+
+	// User running inside container can't list '$TMP/blocked' and would fail to
+	// mount it.
+	blocked, err := ioutil.TempDir(testutil.TmpDir(), "blocked")
+	if err != nil {
+		t.Fatalf("ioutil.TempDir() failed: %v", err)
+	}
+	if err := os.Chmod(blocked, 0700); err != nil {
+		t.Fatalf("os.Chmod(%q) failed: %v", blocked, err)
+	}
+	dir := path.Join(blocked, "test")
+	if err := os.Mkdir(dir, 0755); err != nil {
+		t.Fatalf("os.Mkdir(%q) failed: %v", dir, err)
+	}
+
+	src, err := ioutil.TempDir(testutil.TmpDir(), "src")
+	if err != nil {
+		t.Fatalf("ioutil.TempDir() failed: %v", err)
+	}
+
+	// Set a random user/group with no access to "blocked" dir.
+	podSpecs[1].Process.User.UID = 343
+	podSpecs[1].Process.User.GID = 2401
+	podSpecs[1].Process.Capabilities = nil
+
+	podSpecs[1].Mounts = append(podSpecs[1].Mounts, specs.Mount{
+		Destination: dir,
+		Source:      src,
+		Type:        "bind",
+	})
+
+	rootDir, cleanup, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer cleanup()
+
+	conf := testutil.TestConfig(t)
+	conf.RootDir = rootDir
+
+	pod, cleanup, err := startContainers(conf, podSpecs, ids)
+	if err != nil {
+		t.Fatalf("error starting containers: %v", err)
+	}
+	defer cleanup()
+
+	// Once all containers are started, wait for the child container to exit.
+	// This means that the volume was mounted properly.
+	ws, err := pod[1].Wait()
+	if err != nil {
+		t.Fatalf("running child container: %v", err)
+	}
+	if !ws.Exited() || ws.ExitStatus() != 0 {
+		t.Fatalf("child container failed, waitStatus: %v", ws)
+	}
+}
diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go
new file mode 100644
index 000000000..bac177a88
--- /dev/null
+++ b/runsc/container/shared_volume_test.go
@@ -0,0 +1,273 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package container
+
+import (
+	"bytes"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/test/testutil"
+	"gvisor.dev/gvisor/runsc/boot"
+)
+
+// TestSharedVolume checks that modifications to a volume mount are propagated
+// into and out of the sandbox. With shared file access, a file is created,
+// renamed and removed on the host while "test -f" probes it from inside the
+// sandbox; then the file is created and removed from inside the sandbox and
+// the result is checked on the host with os.Stat.
+func TestSharedVolume(t *testing.T) {
+	conf := testutil.TestConfig(t)
+	conf.FileAccess = boot.FileAccessShared
+
+	// Main process just sleeps. We will use "exec" to probe the state of
+	// the filesystem.
+	spec := testutil.NewSpecWithArgs("sleep", "1000")
+
+	dir, err := ioutil.TempDir(testutil.TmpDir(), "shared-volume-test")
+	if err != nil {
+		t.Fatalf("TempDir failed: %v", err)
+	}
+
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+	if err != nil {
+		t.Fatalf("error setting up container: %v", err)
+	}
+	defer cleanup()
+
+	// Create and start the container.
+	args := Args{
+		ID:        testutil.RandomContainerID(),
+		Spec:      spec,
+		BundleDir: bundleDir,
+	}
+	c, err := New(conf, args)
+	if err != nil {
+		t.Fatalf("error creating container: %v", err)
+	}
+	defer c.Destroy()
+	if err := c.Start(conf); err != nil {
+		t.Fatalf("error starting container: %v", err)
+	}
+
+	// File that will be used to check consistency inside/outside sandbox.
+	filename := filepath.Join(dir, "file")
+
+	// File does not exist yet. Reading from the sandbox should fail.
+	argsTestFile := &control.ExecArgs{
+		Filename: "/usr/bin/test",
+		Argv:     []string{"test", "-f", filename},
+	}
+	if ws, err := c.executeSync(argsTestFile); err != nil {
+		t.Fatalf("unexpected error testing file %q: %v", filename, err)
+	} else if ws.ExitStatus() == 0 {
+		t.Errorf("test %q exited with code %v, wanted not zero", filename, ws.ExitStatus())
+	}
+
+	// Create the file from outside of the sandbox.
+	if err := ioutil.WriteFile(filename, []byte("foobar"), 0777); err != nil {
+		t.Fatalf("error writing to file %q: %v", filename, err)
+	}
+
+	// Now we should be able to test the file from within the sandbox.
+	if ws, err := c.executeSync(argsTestFile); err != nil {
+		t.Fatalf("unexpected error testing file %q: %v", filename, err)
+	} else if ws.ExitStatus() != 0 {
+		t.Errorf("test %q exited with code %v, wanted zero", filename, ws.ExitStatus())
+	}
+
+	// Rename the file from outside of the sandbox.
+	newFilename := filepath.Join(dir, "newfile")
+	if err := os.Rename(filename, newFilename); err != nil {
+		t.Fatalf("os.Rename(%q, %q) failed: %v", filename, newFilename, err)
+	}
+
+	// File should no longer exist at the old path within the sandbox.
+	if ws, err := c.executeSync(argsTestFile); err != nil {
+		t.Fatalf("unexpected error testing file %q: %v", filename, err)
+	} else if ws.ExitStatus() == 0 {
+		t.Errorf("test %q exited with code %v, wanted not zero", filename, ws.ExitStatus())
+	}
+
+	// We should be able to test the new filename from within the sandbox.
+	argsTestNewFile := &control.ExecArgs{
+		Filename: "/usr/bin/test",
+		Argv:     []string{"test", "-f", newFilename},
+	}
+	if ws, err := c.executeSync(argsTestNewFile); err != nil {
+		t.Fatalf("unexpected error testing file %q: %v", newFilename, err)
+	} else if ws.ExitStatus() != 0 {
+		t.Errorf("test %q exited with code %v, wanted zero", newFilename, ws.ExitStatus())
+	}
+
+	// Delete the renamed file from outside of the sandbox.
+	if err := os.Remove(newFilename); err != nil {
+		t.Fatalf("error removing file %q: %v", newFilename, err)
+	}
+
+	// Deleted file should no longer exist at the new path within the sandbox.
+	if ws, err := c.executeSync(argsTestNewFile); err != nil {
+		t.Fatalf("unexpected error testing file %q: %v", newFilename, err)
+	} else if ws.ExitStatus() == 0 {
+		t.Errorf("test %q exited with code %v, wanted not zero", newFilename, ws.ExitStatus())
+	}
+
+	// Now create the file from WITHIN the sandbox. touch is executed with the
+	// UID/GID of the test process (NOTE(review): presumably so it is allowed
+	// to write into the host temp directory; confirm).
+	argsTouch := &control.ExecArgs{
+		Filename: "/usr/bin/touch",
+		Argv:     []string{"touch", filename},
+		KUID:     auth.KUID(os.Getuid()),
+		KGID:     auth.KGID(os.Getgid()),
+	}
+	if ws, err := c.executeSync(argsTouch); err != nil {
+		t.Fatalf("unexpected error touching file %q: %v", filename, err)
+	} else if ws.ExitStatus() != 0 {
+		t.Errorf("touch %q exited with code %v, wanted zero", filename, ws.ExitStatus())
+	}
+
+	// File should exist outside the sandbox.
+	if _, err := os.Stat(filename); err != nil {
+		t.Errorf("stat %q got error %v, wanted nil", filename, err)
+	}
+
+	// Delete the file from within the sandbox.
+	argsRemove := &control.ExecArgs{
+		Filename: "/bin/rm",
+		Argv:     []string{"rm", filename},
+	}
+	if ws, err := c.executeSync(argsRemove); err != nil {
+		t.Fatalf("unexpected error removing file %q: %v", filename, err)
+	} else if ws.ExitStatus() != 0 {
+		t.Errorf("remove %q exited with code %v, wanted zero", filename, ws.ExitStatus())
+	}
+
+	// File should not exist outside the sandbox.
+	if _, err := os.Stat(filename); !os.IsNotExist(err) {
+		t.Errorf("stat %q got error %v, wanted ErrNotExist", filename, err)
+	}
+}
+
+// checkFile copies 'filename' to 'filename.copy' inside the sandbox and compares the copy, read on the host, with 'want'.
+func checkFile(c *Container, filename string, want []byte) error {
+	cpy := filename + ".copy"
+	argsCp := &control.ExecArgs{Filename: "/bin/cp", Argv: []string{"cp", "-f", filename, cpy}}
+	if ws, err := c.executeSync(argsCp); err != nil {
+		return fmt.Errorf("unexpected error copying file %q to %q: %v", filename, cpy, err)
+	} else if ws.ExitStatus() != 0 { // Otherwise a stale copy from an earlier call could be read below.
+		return fmt.Errorf("copying file %q to %q exited with code %v, wanted zero", filename, cpy, ws.ExitStatus())
+	}
+	got, err := ioutil.ReadFile(cpy)
+	if err != nil {
+		return fmt.Errorf("Error reading file %q: %v", cpy, err)
+	}
+	if !bytes.Equal(got, want) {
+		return fmt.Errorf("file content inside the sandbox is wrong, got: %q, want: %q", got, want)
+	}
+	return nil
+}
+
+// TestSharedVolumeFile tests that changes to file content made outside the
+// sandbox are reflected inside, and that appends from inside are not lost.
+func TestSharedVolumeFile(t *testing.T) {
+	conf := testutil.TestConfig(t)
+	conf.FileAccess = boot.FileAccessShared
+
+	// Main process just sleeps. We will use "exec" to probe the state of
+	// the filesystem.
+	spec := testutil.NewSpecWithArgs("sleep", "1000")
+
+	dir, err := ioutil.TempDir(testutil.TmpDir(), "shared-volume-test")
+	if err != nil {
+		t.Fatalf("TempDir failed: %v", err)
+	}
+
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+	if err != nil {
+		t.Fatalf("error setting up container: %v", err)
+	}
+	defer cleanup()
+
+	// Create and start the container.
+	args := Args{
+		ID:        testutil.RandomContainerID(),
+		Spec:      spec,
+		BundleDir: bundleDir,
+	}
+	c, err := New(conf, args)
+	if err != nil {
+		t.Fatalf("error creating container: %v", err)
+	}
+	defer c.Destroy()
+	if err := c.Start(conf); err != nil {
+		t.Fatalf("error starting container: %v", err)
+	}
+
+	// File that will be used to check consistency inside/outside sandbox.
+	filename := filepath.Join(dir, "file")
+
+	// Write file from outside the container and check that the same content is
+	// read inside.
+	want := []byte("host-")
+	if err := ioutil.WriteFile(filename, []byte(want), 0666); err != nil {
+		t.Fatalf("Error writing to %q: %v", filename, err)
+	}
+	if err := checkFile(c, filename, want); err != nil {
+		t.Fatal(err.Error())
+	}
+
+	// Append to file inside the container and check that content is not lost.
+	argsAppend := &control.ExecArgs{
+		Filename: "/bin/bash",
+		Argv:     []string{"bash", "-c", "echo -n sandbox- >> " + filename},
+	}
+	if _, err := c.executeSync(argsAppend); err != nil {
+		t.Fatalf("unexpected error appending file %q: %v", filename, err)
+	}
+	want = []byte("host-sandbox-")
+	if err := checkFile(c, filename, want); err != nil {
+		t.Fatal(err.Error())
+	}
+
+	// Append again from outside the container and check that the same content
+	// is read inside.
+	f, err := os.OpenFile(filename, os.O_APPEND|os.O_WRONLY, 0)
+	if err != nil {
+		t.Fatalf("Error openning file %q: %v", filename, err)
+	}
+	defer f.Close()
+	if _, err := f.Write([]byte("host")); err != nil {
+		t.Fatalf("Error writing to file %q: %v", filename, err)
+	}
+	want = []byte("host-sandbox-host")
+	if err := checkFile(c, filename, want); err != nil {
+		t.Fatal(err.Error())
+	}
+
+	// Shrink file to 5 bytes outside and check that the same content is read inside.
+	if err := f.Truncate(5); err != nil {
+		t.Fatalf("Error truncating file %q: %v", filename, err)
+	}
+	want = want[:5]
+	if err := checkFile(c, filename, want); err != nil {
+		t.Fatal(err.Error())
+	}
+}
diff --git a/runsc/container/state_file.go b/runsc/container/state_file.go
new file mode 100644
index 000000000..17a251530
--- /dev/null
+++ b/runsc/container/state_file.go
@@ -0,0 +1,185 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package container
+
+import (
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+
+	"github.com/gofrs/flock"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+const stateFileExtension = ".state"
+
+// StateFile handles load from/save to container state safely from multiple
+// processes. It uses a lock file to provide synchronization between operations.
+//
+// The lock file is located at: "${s.RootDir}/${s.ID}.lock".
+// The state file is located at: "${s.RootDir}/${s.ID}.state".
+type StateFile struct {
+	// RootDir is the directory containing the container metadata file.
+	RootDir string `json:"rootDir"`
+
+	// ID is the container ID.
+	ID string `json:"id"`
+
+	//
+	// Fields below this line are not saved in the state file and will not
+	// be preserved across commands.
+	//
+
+	once  sync.Once    // Guards the one-time creation of flock in lock().
+	flock *flock.Flock // Lock file handle; nil until lock() has been called.
+}
+
+// List returns the IDs of all containers that have a state file in rootDir.
+func List(rootDir string) ([]string, error) {
+	log.Debugf("List containers %q", rootDir)
+	list, err := filepath.Glob(filepath.Join(rootDir, "*"+stateFileExtension))
+	if err != nil {
+		return nil, err
+	}
+	var out []string
+	for _, path := range list {
+		// The glob only matches names ending in the extension; anything shorter is a bug.
+		fileName := filepath.Base(path)
+		if len(fileName) < len(stateFileExtension) {
+			panic(fmt.Sprintf("invalid file match %q", path))
+		}
+		// Remove the extension and filter out files that do not belong to a container.
+		cid := fileName[:len(fileName)-len(stateFileExtension)]
+		if validateID(cid) == nil {
+			out = append(out, cid)
+		}
+	}
+	return out, nil
+}
+
+// lock acquires the container's lock file, creating the flock handle on first use.
+func (s *StateFile) lock() error {
+	s.once.Do(func() {
+		s.flock = flock.NewFlock(s.lockPath())
+	})
+
+	if err := s.flock.Lock(); err != nil {
+		return fmt.Errorf("acquiring lock on %q: %v", s.flock, err)
+	}
+	return nil
+}
+
+// lockForNew acquires the lock and checks that the state file doesn't exist.
+// This is done to ensure that more than one creation didn't race to create
+// containers with the same ID. On error the lock has already been released.
+func (s *StateFile) lockForNew() error {
+	if err := s.lock(); err != nil {
+		return err
+	}
+
+	// Checks if the container already exists by looking for the metadata file.
+	if _, err := os.Stat(s.statePath()); err == nil {
+		s.unlock()
+		return fmt.Errorf("container already exists")
+	} else if !os.IsNotExist(err) {
+		s.unlock()
+		return fmt.Errorf("looking for existing container: %v", err)
+	}
+	return nil
+}
+
+// unlock releases the container's lock file. It panics if the lock is not held.
+func (s *StateFile) unlock() error {
+	if !s.flock.Locked() {
+		panic("unlock called without lock held")
+	}
+
+	if err := s.flock.Unlock(); err != nil {
+		log.Warningf("Error to release lock on %q: %v", s.flock, err)
+		return fmt.Errorf("releasing lock on %q: %v", s.flock, err)
+	}
+	return nil
+}
+
+// saveLocked marshals 'v' to JSON and writes it to the state file.
+//
+// Preconditions: lock() must have been called before; it panics otherwise.
+func (s *StateFile) saveLocked(v interface{}) error {
+	if !s.flock.Locked() {
+		panic("saveLocked called without lock held")
+	}
+
+	meta, err := json.Marshal(v)
+	if err != nil {
+		return err
+	}
+	if err := ioutil.WriteFile(s.statePath(), meta, 0640); err != nil {
+		return fmt.Errorf("writing json file: %v", err)
+	}
+	return nil
+}
+
+// load reads the state file while holding the lock and decodes its JSON into 'v'.
+func (s *StateFile) load(v interface{}) error {
+	if err := s.lock(); err != nil {
+		return err
+	}
+	defer s.unlock()
+	metaBytes, err := ioutil.ReadFile(s.statePath())
+	if err != nil {
+		return err
+	}
+	return json.Unmarshal(metaBytes, v) // 'v' must be a non-nil pointer, else an error is returned.
+}
+
+func (s *StateFile) close() error { // Closes the lock file handle, if one was ever created.
+	if s.flock == nil {
+		return nil // lock() was never called, so there is nothing to close.
+	}
+	if s.flock.Locked() {
+		panic("Closing locked file") // Callers must unlock() before close().
+	}
+	return s.flock.Close()
+}
+
+func buildStatePath(rootDir, id string) string {
+	return filepath.Join(rootDir, id+stateFileExtension) // "<rootDir>/<id>.state"
+}
+
+// statePath is the full path to the state file of this container.
+func (s *StateFile) statePath() string {
+	return buildStatePath(s.RootDir, s.ID)
+}
+
+// lockPath is the full path to the lock file, next to the state file.
+func (s *StateFile) lockPath() string {
+	return filepath.Join(s.RootDir, s.ID+".lock")
+}
+
+// destroy deletes the state file and the lock file; missing files are not an
+// error. It may be called with the lock file held. In that case, the lock file
+// must still be unlocked and properly closed after destroy returns.
+func (s *StateFile) destroy() error {
+	if err := os.Remove(s.statePath()); err != nil && !os.IsNotExist(err) {
+		return err
+	}
+	if err := os.Remove(s.lockPath()); err != nil && !os.IsNotExist(err) {
+		return err
+	}
+	return nil
+}
diff --git a/runsc/container/status.go b/runsc/container/status.go
new file mode 100644
index 000000000..91d9112f1
--- /dev/null
+++ b/runsc/container/status.go
@@ -0,0 +1,60 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package container
+
+// Status enumerates container statuses. The statuses and their semantics are
+// part of the runtime CLI spec.
+type Status int
+
+const (
+	// Created indicates "the runtime has finished the create operation and
+	// the container process has neither exited nor executed the
+	// user-specified program".
+	Created Status = iota
+
+	// Creating indicates "the container is being created".
+	Creating
+
+	// Paused indicates that the process within the container has been
+	// suspended.
+	Paused
+
+	// Running indicates "the container process has executed the
+	// user-specified program but has not exited".
+	Running
+
+	// Stopped indicates "the container process has exited".
+	Stopped
+)
+
+// String returns the textual name of a Status. The names are part of the
+// runtime CLI spec and should not be changed.
+func (s Status) String() string {
+	// Indexed by Status value; the keyed form keeps every name tied to its
+	// constant regardless of the order in which the constants are declared.
+	names := [...]string{
+		Created:  "created",
+		Creating: "creating",
+		Paused:   "paused",
+		Running:  "running",
+		Stopped:  "stopped",
+	}
+
+	// Any value outside the declared statuses has no name of its own.
+	if s < 0 || int(s) >= len(names) {
+		return "unknown"
+	}
+	return names[s]
+}
diff --git a/runsc/debian/description b/runsc/debian/description
new file mode 100644
index 000000000..9e8e08805
--- /dev/null
+++ b/runsc/debian/description
@@ -0,0 +1 @@
+gVisor container sandbox runtime
diff --git a/runsc/debian/postinst.sh b/runsc/debian/postinst.sh
new file mode 100755
index 000000000..dc7aeee87
--- /dev/null
+++ b/runsc/debian/postinst.sh
@@ -0,0 +1,24 @@
+#!/bin/sh -e
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ "$1" != configure ]; then  # Do nothing unless invoked with the "configure" argument.
+  exit 0
+fi
+
+if [ -f /etc/docker/daemon.json ]; then  # Only touch Docker when its daemon config exists.
+  runsc install  # The script runs under "sh -e", so a failure here aborts it.
+  systemctl restart docker || echo "unable to restart docker; you must do so manually." >&2
+fi
diff --git a/runsc/flag/BUILD b/runsc/flag/BUILD
new file mode 100644
index 000000000..5cb7604a8
--- /dev/null
+++ b/runsc/flag/BUILD
@@ -0,0 +1,9 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "flag",
+    srcs = ["flag.go"],
+    visibility = ["//:sandbox"],  # NOTE(review): presumably a package_group in the root BUILD file; confirm.
+)
diff --git a/runsc/flag/flag.go b/runsc/flag/flag.go
new file mode 100644
index 000000000..0ca4829d7
--- /dev/null
+++ b/runsc/flag/flag.go
@@ -0,0 +1,33 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package flag
+
+import (
+	"flag"
+)
+
+type FlagSet = flag.FlagSet // FlagSet is an alias for the standard library flag.FlagSet.
+
+var ( // Re-exported functions and variables of the standard library flag package.
+	NewFlagSet  = flag.NewFlagSet
+	String      = flag.String
+	Bool        = flag.Bool
+	Int         = flag.Int
+	Uint        = flag.Uint
+	CommandLine = flag.CommandLine
+	Parse       = flag.Parse
+)
+
+const ContinueOnError = flag.ContinueOnError // ContinueOnError mirrors flag.ContinueOnError.
diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD
new file mode 100644
index 000000000..64a406ae2
--- /dev/null
+++ b/runsc/fsgofer/BUILD
@@ -0,0 +1,35 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "fsgofer",
+    srcs = [
+        "fsgofer.go",
+        "fsgofer_amd64_unsafe.go",
+        "fsgofer_arm64_unsafe.go",
+        "fsgofer_unsafe.go",
+    ],
+    visibility = ["//runsc:__subpackages__"],  # Usable by //runsc and every package beneath it.
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/fd",
+        "//pkg/log",
+        "//pkg/p9",
+        "//pkg/sync",
+        "//pkg/syserr",
+        "//runsc/specutils",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+go_test(
+    name = "fsgofer_test",
+    size = "small",
+    srcs = ["fsgofer_test.go"],
+    library = ":fsgofer",  # The go_library target defined above in this file.
+    deps = [
+        "//pkg/log",
+        "//pkg/p9",
+    ],
+)
diff --git a/runsc/fsgofer/filter/BUILD b/runsc/fsgofer/filter/BUILD
new file mode 100644
index 000000000..82b48ef32
--- /dev/null
+++ b/runsc/fsgofer/filter/BUILD
@@ -0,0 +1,26 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "filter",
+    srcs = [
+        "config.go",
+        "config_amd64.go",
+        "config_arm64.go",
+        "extra_filters.go",
+        "extra_filters_msan.go",
+        "extra_filters_race.go",
+        "filter.go",
+    ],
+    visibility = [
+        "//runsc:__subpackages__",  # Usable by //runsc and every package beneath it.
+    ],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/flipcall",
+        "//pkg/log",
+        "//pkg/seccomp",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go
new file mode 100644
index 000000000..1dce36965
--- /dev/null
+++ b/runsc/fsgofer/filter/config.go
@@ -0,0 +1,249 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package filter
+
+import (
+	"os"
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// allowedSyscalls is the set of syscalls executed by the gofer.
+var allowedSyscalls = seccomp.SyscallRules{
+	syscall.SYS_ACCEPT:        {},
+	syscall.SYS_CLOCK_GETTIME: {},
+	syscall.SYS_CLONE: []seccomp.Rule{
+		{
+			seccomp.AllowValue(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+		},
+	},
+	syscall.SYS_CLOSE:     {},
+	syscall.SYS_DUP:       {},
+	syscall.SYS_EPOLL_CTL: {},
+	syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(0),
+		},
+	},
+	syscall.SYS_EVENTFD2: []seccomp.Rule{
+		{
+			seccomp.AllowValue(0),
+			seccomp.AllowValue(0),
+		},
+	},
+	syscall.SYS_EXIT:       {},
+	syscall.SYS_EXIT_GROUP: {},
+	syscall.SYS_FALLOCATE: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(0),
+		},
+	},
+	syscall.SYS_FCHMOD:   {},
+	syscall.SYS_FCHOWNAT: {},
+	syscall.SYS_FCNTL: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.F_GETFL),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.F_SETFL),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.F_GETFD),
+		},
+		// Used by flipcall.PacketWindowAllocator.Init().
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(unix.F_ADD_SEALS),
+		},
+	},
+	syscall.SYS_FSTAT:     {},
+	syscall.SYS_FSTATFS:   {},
+	syscall.SYS_FSYNC:     {},
+	syscall.SYS_FTRUNCATE: {},
+	syscall.SYS_FUTEX: {
+		seccomp.Rule{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(0),
+		},
+		seccomp.Rule{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(0),
+		},
+		// Non-private futex used for flipcall.
+		seccomp.Rule{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(linux.FUTEX_WAIT),
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+		},
+		seccomp.Rule{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(linux.FUTEX_WAKE),
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+		},
+	},
+	syscall.SYS_GETDENTS64:   {},
+	syscall.SYS_GETPID:       {},
+	unix.SYS_GETRANDOM:       {},
+	syscall.SYS_GETTID:       {},
+	syscall.SYS_GETTIMEOFDAY: {},
+	syscall.SYS_LINKAT:       {},
+	syscall.SYS_LSEEK:        {},
+	syscall.SYS_MADVISE:      {},
+	unix.SYS_MEMFD_CREATE:    {}, // Used by flipcall.PacketWindowAllocator.Init().
+	syscall.SYS_MKDIRAT:      {},
+	// Used by the Go runtime as a temporary workaround for a Linux
+	// 5.2-5.4 bug.
+	//
+	// See src/runtime/os_linux_x86.go.
+	//
+	// TODO(b/148688965): Remove once this is gone from Go.
+	syscall.SYS_MLOCK: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(4096),
+		},
+	},
+	syscall.SYS_MMAP: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MAP_SHARED),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
+		},
+	},
+	syscall.SYS_MPROTECT:   {},
+	syscall.SYS_MUNMAP:     {},
+	syscall.SYS_NANOSLEEP:  {},
+	syscall.SYS_OPENAT:     {},
+	syscall.SYS_PPOLL:      {},
+	syscall.SYS_PREAD64:    {},
+	syscall.SYS_PWRITE64:   {},
+	syscall.SYS_READ:       {},
+	syscall.SYS_READLINKAT: {},
+	syscall.SYS_RECVMSG: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
+		},
+	},
+	syscall.SYS_RENAMEAT:        {},
+	syscall.SYS_RESTART_SYSCALL: {},
+	syscall.SYS_RT_SIGPROCMASK:  {},
+	syscall.SYS_RT_SIGRETURN:    {},
+	syscall.SYS_SCHED_YIELD:     {},
+	syscall.SYS_SENDMSG: []seccomp.Rule{
+		// Used by fdchannel.Endpoint.SendFD().
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(0),
+		},
+		// Used by unet.SocketWriter.WriteVec().
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
+		},
+	},
+	syscall.SYS_SHUTDOWN: []seccomp.Rule{
+		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
+	},
+	syscall.SYS_SIGALTSTACK: {},
+	// Used by fdchannel.NewConnectedSockets().
+	syscall.SYS_SOCKETPAIR: {
+		{
+			seccomp.AllowValue(syscall.AF_UNIX),
+			seccomp.AllowValue(syscall.SOCK_SEQPACKET | syscall.SOCK_CLOEXEC),
+			seccomp.AllowValue(0),
+		},
+	},
+	syscall.SYS_SYMLINKAT: {},
+	syscall.SYS_TGKILL: []seccomp.Rule{
+		{
+			seccomp.AllowValue(uint64(os.Getpid())),
+		},
+	},
+	syscall.SYS_UNLINKAT:  {},
+	syscall.SYS_UTIMENSAT: {},
+	syscall.SYS_WRITE:     {},
+}
+
+var udsSyscalls = seccomp.SyscallRules{
+	syscall.SYS_SOCKET: []seccomp.Rule{
+		{
+			seccomp.AllowValue(syscall.AF_UNIX),
+			seccomp.AllowValue(syscall.SOCK_STREAM),
+			seccomp.AllowValue(0),
+		},
+		{
+			seccomp.AllowValue(syscall.AF_UNIX),
+			seccomp.AllowValue(syscall.SOCK_DGRAM),
+			seccomp.AllowValue(0),
+		},
+		{
+			seccomp.AllowValue(syscall.AF_UNIX),
+			seccomp.AllowValue(syscall.SOCK_SEQPACKET),
+			seccomp.AllowValue(0),
+		},
+	},
+	syscall.SYS_CONNECT: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+		},
+	},
+}
diff --git a/runsc/fsgofer/filter/config_amd64.go b/runsc/fsgofer/filter/config_amd64.go
new file mode 100644
index 000000000..a4b28cb8b
--- /dev/null
+++ b/runsc/fsgofer/filter/config_amd64.go
@@ -0,0 +1,33 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+func init() {
+	allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{
+		{seccomp.AllowValue(linux.ARCH_GET_FS)},
+		{seccomp.AllowValue(linux.ARCH_SET_FS)},
+	}
+
+	allowedSyscalls[syscall.SYS_NEWFSTATAT] = []seccomp.Rule{}
+}
diff --git a/runsc/fsgofer/filter/config_arm64.go b/runsc/fsgofer/filter/config_arm64.go
new file mode 100644
index 000000000..d2697deb7
--- /dev/null
+++ b/runsc/fsgofer/filter/config_arm64.go
@@ -0,0 +1,27 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+func init() {
+	allowedSyscalls[syscall.SYS_FSTATAT] = []seccomp.Rule{}
+}
diff --git a/runsc/fsgofer/filter/extra_filters.go b/runsc/fsgofer/filter/extra_filters.go
new file mode 100644
index 000000000..e28d4b8d6
--- /dev/null
+++ b/runsc/fsgofer/filter/extra_filters.go
@@ -0,0 +1,28 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !msan,!race
+
+package filter
+
+import (
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// instrumentationFilters returns additional filters for syscalls used by
+// Go instrumentation tools, e.g. -race, -msan.
+// Returns empty when disabled.
+func instrumentationFilters() seccomp.SyscallRules {
+	return nil
+}
diff --git a/runsc/fsgofer/filter/extra_filters_msan.go b/runsc/fsgofer/filter/extra_filters_msan.go
new file mode 100644
index 000000000..8c6179c8f
--- /dev/null
+++ b/runsc/fsgofer/filter/extra_filters_msan.go
@@ -0,0 +1,33 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build msan
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// instrumentationFilters returns additional filters for syscalls used by MSAN.
+func instrumentationFilters() seccomp.SyscallRules {
+	log.Warningf("*** SECCOMP WARNING: MSAN is enabled: syscall filters less restrictive!")
+	return seccomp.SyscallRules{
+		syscall.SYS_SCHED_GETAFFINITY: {},
+		syscall.SYS_SET_ROBUST_LIST:   {},
+	}
+}
diff --git a/runsc/fsgofer/filter/extra_filters_race.go b/runsc/fsgofer/filter/extra_filters_race.go
new file mode 100644
index 000000000..885c92f7a
--- /dev/null
+++ b/runsc/fsgofer/filter/extra_filters_race.go
@@ -0,0 +1,42 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build race
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// instrumentationFilters returns additional filters for syscalls used by TSAN.
+func instrumentationFilters() seccomp.SyscallRules {
+	log.Warningf("*** SECCOMP WARNING: TSAN is enabled: syscall filters less restrictive!")
+	return seccomp.SyscallRules{
+		syscall.SYS_BRK:             {},
+		syscall.SYS_CLONE:           {},
+		syscall.SYS_FUTEX:           {},
+		syscall.SYS_MADVISE:         {},
+		syscall.SYS_MMAP:            {},
+		syscall.SYS_MUNLOCK:         {},
+		syscall.SYS_NANOSLEEP:       {},
+		syscall.SYS_OPEN:            {},
+		syscall.SYS_SET_ROBUST_LIST: {},
+		// Used within glibc's malloc.
+		syscall.SYS_TIME: {},
+	}
+}
diff --git a/runsc/fsgofer/filter/filter.go b/runsc/fsgofer/filter/filter.go
new file mode 100644
index 000000000..289886720
--- /dev/null
+++ b/runsc/fsgofer/filter/filter.go
@@ -0,0 +1,38 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package filter defines all syscalls the gofer is allowed to make, and
+// installs seccomp filters to prevent prohibited syscalls in case it's
+// compromised.
+package filter
+
+import (
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// Install merges the instrumentation rules into allowedSyscalls and installs
+// the resulting seccomp filters.
+func Install() error {
+	// Extra rules needed by -race and -msan builds; empty when not enabled.
+	allowedSyscalls.Merge(instrumentationFilters())
+
+	return seccomp.Install(allowedSyscalls)
+}
+
+// InstallUDSFilters adds the syscalls needed to connect to a host UDS to
+// allowedSyscalls. Nothing is installed here; Install() applies the table.
+func InstallUDSFilters() {
+	// Add additional filters required for connecting to the host's sockets.
+	allowedSyscalls.Merge(udsSyscalls)
+}
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
new file mode 100644
index 000000000..1942f50d7
--- /dev/null
+++ b/runsc/fsgofer/fsgofer.go
@@ -0,0 +1,1149 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fsgofer implements p9.File giving access to local files using
+// a simple mapping from a path prefix that is added to the path requested
+// by the sandbox. Ex:
+//
+//   prefix: "/docker/imgs/alpine"
+//   app path: /bin/ls => /docker/imgs/alpine/bin/ls
+package fsgofer
+
+import (
+	"fmt"
+	"io"
+	"math"
+	"os"
+	"path"
+	"path/filepath"
+	"runtime"
+	"strconv"
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fd"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+const (
+	// invalidMode is set to a value that doesn't match any other valid
+	// modes to ensure an unopened/closed file fails all mode checks.
+	invalidMode = p9.OpenFlags(math.MaxUint32)
+
+	openFlags = syscall.O_NOFOLLOW | syscall.O_CLOEXEC
+)
+
+type fileType int
+
+const (
+	regular fileType = iota
+	directory
+	symlink
+	socket
+	unknown
+)
+
+// String implements fmt.Stringer; unrecognized values print as "unknown".
+func (f fileType) String() string {
+	names := map[fileType]string{
+		regular:   "regular",
+		directory: "directory",
+		symlink:   "symlink",
+		socket:    "socket",
+	}
+	if name, ok := names[f]; ok {
+		return name
+	}
+	// Reached for the 'unknown' constant and for any out-of-range value.
+	return "unknown"
+}
+
+// ControlSocketAddr returns the abstract (NUL-prefixed) unix socket name for id.
+func ControlSocketAddr(id string) string {
+	return fmt.Sprintf("\x00runsc-gofer.%s", id)
+}
+
+// Config sets configuration options for each attach point.
+type Config struct {
+	// ROMount is set to true if this is a readonly mount.
+	ROMount bool
+
+	// PanicOnWrite panics on attempts to write to RO mounts.
+	PanicOnWrite bool
+
+	// HostUDS signals whether the gofer can mount a host's UDS.
+	HostUDS bool
+}
+
+type attachPoint struct {
+	prefix string
+	conf   Config
+
+	// attachedMu protects attached.
+	attachedMu sync.Mutex
+	attached   bool
+
+	// deviceMu protects devices and nextDevice.
+	deviceMu sync.Mutex
+
+	// nextDevice is the next device id that will be allocated.
+	nextDevice uint8
+
+	// devices is a map from actual host devices to "small" integers that
+	// can be combined with host inode to form a unique virtual inode id.
+	devices map[uint64]uint8
+}
+
+// NewAttachPoint returns a p9.Attacher that serves the local files found
+// under 'prefix'. An error is returned unless 'prefix' is an absolute path.
+func NewAttachPoint(prefix string, c Config) (p9.Attacher, error) {
+	if filepath.IsAbs(prefix) {
+		return &attachPoint{
+			prefix:  prefix,
+			conf:    c,
+			devices: make(map[uint64]uint8),
+		}, nil
+	}
+	// Relative prefixes are rejected up front.
+	return nil, fmt.Errorf("attach point prefix must be absolute %q", prefix)
+}
+
+// Attach implements p9.Attacher. It succeeds at most once per attach point.
+func (a *attachPoint) Attach() (p9.File, error) {
+	a.attachedMu.Lock()
+	defer a.attachedMu.Unlock()
+	if a.attached {
+		return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix)
+	}
+
+	f, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) {
+		return fd.Open(a.prefix, openFlags|mode, 0)
+	})
+	if err != nil {
+		return nil, fmt.Errorf("unable to open %q: %v", a.prefix, err)
+	}
+
+	st, err := stat(f.FD())
+	if err != nil {
+		f.Close() // Don't leak the host FD when the attach fails.
+		return nil, fmt.Errorf("unable to stat %q: %v", a.prefix, err)
+	}
+	lf, err := newLocalFile(a, f, a.prefix, st)
+	if err != nil {
+		f.Close() // Don't leak the host FD when the attach fails.
+		return nil, fmt.Errorf("unable to create localFile %q: %v", a.prefix, err)
+	}
+	a.attached = true
+	return lf, nil
+}
+
+// makeQID returns the QID for the given stat buffer. QID.Path packs an 8-bit
+// device number (see 'devices') above the low 56 bits of the host inode id.
+func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
+	a.deviceMu.Lock()
+	defer a.deviceMu.Unlock()
+
+	// First map the host device id to a unique 8-bit integer.
+	dev, ok := a.devices[stat.Dev]
+	if !ok {
+		a.devices[stat.Dev] = a.nextDevice
+		dev = a.nextDevice
+		a.nextDevice++
+		if a.nextDevice < dev {
+			panic(fmt.Sprintf("device id overflow! map: %+v", a.devices))
+		}
+	}
+
+	// Construct a "virtual" inode id with the uint8 device number in the
+	// first 8 bits, and the rest of the bits from the host inode id.
+	maskedIno := stat.Ino & 0x00ffffffffffffff
+	if maskedIno != stat.Ino {
+		log.Warningf("first 8 bits of host inode id %x will be truncated to construct virtual inode id", stat.Ino)
+	}
+	ino := uint64(dev)<<56 | maskedIno
+	log.Debugf("host inode %x on device %x mapped to virtual inode %x", stat.Ino, stat.Dev, ino)
+	return p9.QID{
+		Type: p9.FileMode(stat.Mode).QIDType(),
+		Path: ino,
+	}
+}
+
+// localFile implements p9.File wrapping a local file. The underlying file
+// is opened during Walk() and stored in 'file' to be used with other
+// operations. The file is opened as readonly, unless it's a symlink or there is
+// no read access, which requires O_PATH. 'file' is dup'ed when Walk(nil) is
+// called to clone the file. This reduces the number of walks that need to be
+// done by the host file system when files are reused.
+//
+// The file may be reopened if the requested mode in Open() is not a subset of
+// current mode. Consequently, 'file' could have a mode wider than requested and
+// must be verified before read/write operations. Before the file is opened and
+// after it's closed, 'mode' is set to an invalid value to prevent an unopened
+// file from being used.
+//
+// The reason that the file is not opened initially as read-write is for better
+// performance with 'overlay2' storage driver. overlay2 eagerly copies the
+// entire file up when it's opened in write mode, and would perform badly when
+// multiple files are only being opened for read (esp. startup).
+type localFile struct {
+	p9.DefaultWalkGetAttr
+
+	// attachPoint is the attachPoint that serves this localFile.
+	attachPoint *attachPoint
+
+	// hostPath will be safely updated by the Renamed hook.
+	hostPath string
+
+	// file is opened when localFile is created and it's never nil. It may be
+	// reopened if the Open() mode is wider than the mode the file was originally
+	// opened with.
+	file *fd.FD
+
+	// mode is the mode in which the file was opened. Set to invalidMode
+	// if localFile isn't opened.
+	mode p9.OpenFlags
+
+	// ft is the fileType for this file.
+	ft fileType
+
+	// readDirMu protects against concurrent Readdir calls.
+	readDirMu sync.Mutex
+
+	// lastDirentOffset is the last offset returned by Readdir(). If another call
+	// to Readdir is made at the same offset, the file doesn't need to be
+	// repositioned. This is an important optimization because the caller must
+	// always make one extra call to detect EOF (empty result, no error).
+	lastDirentOffset uint64
+}
+
+var procSelfFD *fd.FD
+
+// OpenProcSelfFD opens the /proc/self/fd directory and stores it in procSelfFD,
+// which reopenProcFd uses to reopen file descriptors.
+func OpenProcSelfFD() error {
+	d, err := syscall.Open("/proc/self/fd", syscall.O_RDONLY|syscall.O_DIRECTORY, 0)
+	if err != nil {
+		return fmt.Errorf("error opening /proc/self/fd: %v", err)
+	}
+	procSelfFD = fd.New(d)
+	return nil
+}
+
+func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) {
+	// Entries in /proc/self/fd are symlinks, so O_NOFOLLOW must be cleared.
+	newFD, err := syscall.Openat(int(procSelfFD.FD()), strconv.Itoa(f.FD()), mode&^syscall.O_NOFOLLOW, 0)
+	if err != nil {
+		return nil, err
+	}
+	return fd.New(newFD), nil
+}
+
+func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, error) {
+	childPath := path.Join(parent.hostPath, name)
+	child, err := openAnyFile(childPath, func(flags int) (*fd.FD, error) {
+		return fd.OpenAt(parent.file, name, openFlags|flags, 0)
+	})
+	return child, childPath, err
+}
+
+// openAnyFile attempts to open the file in O_RDONLY and, if that fails, falls
+// back to O_PATH. 'path' is used for logging messages only. 'fn' is what does
+// the actual file open and is customizable by the caller.
+func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error) {
+	// Modes are attempted in the following order:
+	//   1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs.
+	//      Use non-blocking to prevent getting stuck inside open(2) for
+	//      FIFOs. This option has no effect on regular files.
+	//   2. PATH: for symlinks, sockets.
+	modes := [...]int{syscall.O_RDONLY | syscall.O_NONBLOCK, unix.O_PATH}
+
+	// lastErr is the failure reported by the most recent attempt.
+	var lastErr error
+	for attempt, mode := range modes {
+		file, err := fn(mode)
+		if err == nil {
+			// The open succeeded, we're done.
+			return file, nil
+		}
+
+		if errno := extractErrno(err); errno == syscall.ENOENT {
+			// File doesn't exist, no point in retrying.
+			return nil, errno
+		}
+
+		// The open failed. Move on to the next mode, remembering the error in
+		// case this was the last attempt.
+		log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", attempt, openFlags|mode, path, err)
+		lastErr = err
+	}
+
+	// All attempts to open the file have failed, return the last error. Note
+	// that 'modes' is never empty, so 'lastErr' is always set at this point.
+	log.Debugf("Failed to open file, path: %q, err: %v", path, lastErr)
+	return nil, extractErrno(lastErr)
+}
+
+func getSupportedFileType(stat syscall.Stat_t, permitSocket bool) (fileType, error) {
+	var ft fileType
+	switch stat.Mode & syscall.S_IFMT {
+	case syscall.S_IFREG:
+		ft = regular
+	case syscall.S_IFDIR:
+		ft = directory
+	case syscall.S_IFLNK:
+		ft = symlink
+	case syscall.S_IFSOCK:
+		if !permitSocket {
+			return unknown, syscall.EPERM
+		}
+		ft = socket
+	default:
+		return unknown, syscall.EPERM
+	}
+	return ft, nil
+}
+
+// newLocalFile wraps 'file' in a not-yet-opened localFile (mode is invalidMode).
+func newLocalFile(a *attachPoint, file *fd.FD, path string, stat syscall.Stat_t) (*localFile, error) {
+	kind, err := getSupportedFileType(stat, a.conf.HostUDS)
+	if err != nil {
+		return nil, err
+	}
+	return &localFile{
+		attachPoint: a,
+		hostPath:    path,
+		file:        file,
+		mode:        invalidMode,
+		ft:          kind,
+	}, nil
+}
+
+// newFDMaybe dups the FD held by 'file' and makes the copy non-blocking. Nil is
+// returned if either step fails: handing out a file without a host FD is
+// preferable to failing the whole operation.
+func newFDMaybe(file *fd.FD) *fd.FD {
+	rawFD, dupErr := syscall.Dup(file.FD())
+	// Technically, the runtime may call the finalizer on file as soon as
+	// FD() returns.
+	runtime.KeepAlive(file)
+	if dupErr != nil {
+		return nil
+	}
+
+	// fd is blocking; non-blocking is required.
+	donated := fd.New(rawFD)
+	if syscall.SetNonblock(donated.FD(), true) == nil {
+		return donated
+	}
+	donated.Close()
+	return nil
+}
+
+func stat(fd int) (syscall.Stat_t, error) {
+	var stat syscall.Stat_t
+	if err := syscall.Fstat(fd, &stat); err != nil {
+		return syscall.Stat_t{}, err
+	}
+	return stat, nil
+}
+
+func fchown(fd int, uid p9.UID, gid p9.GID) error {
+	return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW)
+}
+
+// Open implements p9.File.
+func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
+	if l.isOpen() {
+		panic(fmt.Sprintf("attempting to open already opened file: %q", l.hostPath))
+	}
+
+	// The control file is reused for read-only opens. Any other mode requires
+	// the file to be reopened.
+	target, reopened := l.file, flags != p9.ReadOnly
+	if reopened {
+		// Ideally reopen would call name_to_handle_at (with empty name) and
+		// open_by_handle_at to reopen the file without using 'hostPath'. However,
+		// name_to_handle_at and open_by_handle_at aren't supported by overlay2.
+		log.Debugf("Open reopening file, flags: %v, %q", flags, l.hostPath)
+		// Constrain open flags to the open mode and O_TRUNC.
+		osFlags := openFlags | (flags.OSFlags() & (syscall.O_ACCMODE | syscall.O_TRUNC))
+		var err error
+		if target, err = reopenProcFd(l.file, osFlags); err != nil {
+			return nil, p9.QID{}, 0, extractErrno(err)
+		}
+	} else {
+		log.Debugf("Open reusing control file, flags: %v, %q", flags, l.hostPath)
+	}
+
+	st, err := stat(target.FD())
+	if err != nil {
+		if reopened {
+			target.Close()
+		}
+		return nil, p9.QID{}, 0, extractErrno(err)
+	}
+
+	// Donate FD for regular files only.
+	var donated *fd.FD
+	if st.Mode&syscall.S_IFMT == syscall.S_IFREG {
+		donated = newFDMaybe(target)
+	}
+
+	// Swap in the reopened file, closing the old control file.
+	if reopened {
+		if err := l.file.Close(); err != nil {
+			log.Warningf("Error closing file %q: %v", l.hostPath, err)
+		}
+		l.file = target
+	}
+	l.mode = flags & p9.OpenFlagsModeMask
+	return donated, l.attachPoint.makeQID(st), 0, nil
+}
+
+// Create implements p9.File.
+func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) {
+	if conf := l.attachPoint.conf; conf.ROMount {
+		if conf.PanicOnWrite {
+			panic("attempt to write to RO mount")
+		}
+		return nil, nil, p9.QID{}, 0, syscall.EBADF
+	}
+
+	// 'file' may be used for other operations (e.g. Walk), so read access is
+	// always added to flags. Note that resulting file might have a wider mode
+	// than needed for each particular case.
+	osFlags := openFlags | syscall.O_CREAT | syscall.O_EXCL
+	switch mode {
+	case p9.WriteOnly:
+		osFlags |= syscall.O_RDWR
+	default:
+		osFlags |= mode.OSFlags()
+	}
+
+	child, err := fd.OpenAt(l.file, name, osFlags, uint32(perm.Permissions()))
+	if err != nil {
+		return nil, nil, p9.QID{}, 0, extractErrno(err)
+	}
+	cleanup := specutils.MakeCleanup(func() {
+		child.Close()
+		// Best effort attempt to remove the file in case of failure.
+		if err := syscall.Unlinkat(l.file.FD(), name); err != nil {
+			log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err)
+		}
+	})
+	defer cleanup.Clean()
+
+	if err := fchown(child.FD(), uid, gid); err != nil {
+		return nil, nil, p9.QID{}, 0, extractErrno(err)
+	}
+	st, err := stat(child.FD())
+	if err != nil {
+		return nil, nil, p9.QID{}, 0, extractErrno(err)
+	}
+
+	// Success: release the cleanup before handing the new file out.
+	cleanup.Release()
+	created := &localFile{
+		attachPoint: l.attachPoint,
+		hostPath:    path.Join(l.hostPath, name),
+		file:        child,
+		mode:        mode,
+	}
+	return newFDMaybe(created.file), created, l.attachPoint.makeQID(st), 0, nil
+}
+
+// Mkdir implements p9.File.
+func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) {
+	conf := l.attachPoint.conf
+	if conf.ROMount {
+		if conf.PanicOnWrite {
+			panic("attempt to write to RO mount")
+		}
+		return p9.QID{}, syscall.EBADF
+	}
+
+	if err := syscall.Mkdirat(l.file.FD(), name, uint32(perm.Permissions())); err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	cu := specutils.MakeCleanup(func() {
+		// Best effort attempt to remove the dir in case of failure.
+		if err := unix.Unlinkat(l.file.FD(), name, unix.AT_REMOVEDIR); err != nil {
+			log.Warningf("error unlinking dir %q after failure: %v", path.Join(l.hostPath, name), err)
+		}
+	})
+	defer cu.Clean()
+
+	// Open directory to change ownership and stat it.
+	flags := syscall.O_DIRECTORY | syscall.O_RDONLY | openFlags
+	f, err := fd.OpenAt(l.file, name, flags, 0)
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	defer f.Close()
+
+	if err := fchown(f.FD(), uid, gid); err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	stat, err := stat(f.FD())
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+
+	cu.Release()
+	return l.attachPoint.makeQID(stat), nil
+}
+
+// Walk implements p9.File.
+func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) {
+	// Duplicate current file if 'names' is empty.
+	if len(names) == 0 {
+		newFile, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) {
+			return reopenProcFd(l.file, openFlags|mode)
+		})
+		if err != nil {
+			return nil, nil, extractErrno(err)
+		}
+		stat, err := stat(newFile.FD())
+		if err != nil {
+			newFile.Close()
+			return nil, nil, extractErrno(err)
+		}
+
+		c := &localFile{
+			attachPoint: l.attachPoint,
+			hostPath:    l.hostPath,
+			file:        newFile,
+			mode:        invalidMode,
+			ft:          l.ft, // The clone is the same file, so keep its type.
+		}
+		return []p9.QID{l.attachPoint.makeQID(stat)}, c, nil
+	}
+
+	var qids []p9.QID
+	last := l
+	for _, name := range names {
+		f, path, err := openAnyFileFromParent(last, name)
+		if last != l {
+			last.Close()
+		}
+		if err != nil {
+			return nil, nil, extractErrno(err)
+		}
+		stat, err := stat(f.FD())
+		if err != nil {
+			f.Close()
+			return nil, nil, extractErrno(err)
+		}
+		c, err := newLocalFile(last.attachPoint, f, path, stat)
+		if err != nil {
+			f.Close()
+			return nil, nil, extractErrno(err)
+		}
+
+		qids = append(qids, l.attachPoint.makeQID(stat))
+		last = c
+	}
+	return qids, last, nil
+}
+
+// StatFS implements p9.File.
+func (l *localFile) StatFS() (p9.FSStat, error) {
+	var buf syscall.Statfs_t
+	if err := syscall.Fstatfs(l.file.FD(), &buf); err != nil {
+		return p9.FSStat{}, extractErrno(err)
+	}
+	// Populate with what's available.
+	fsStat := p9.FSStat{
+		Type:            uint32(buf.Type),
+		BlockSize:       uint32(buf.Bsize),
+		Blocks:          buf.Blocks,
+		BlocksFree:      buf.Bfree,
+		BlocksAvailable: buf.Bavail,
+		Files:           buf.Files,
+		FilesFree:       buf.Ffree,
+		NameLength:      uint32(buf.Namelen),
+	}
+	return fsStat, nil
+}
+
+// FSync implements p9.File. EBADF is returned when isOpen() reports false.
+func (l *localFile) FSync() error {
+	if !l.isOpen() {
+		return syscall.EBADF
+	}
+	if err := syscall.Fsync(l.file.FD()); err != nil {
+		return extractErrno(err)
+	}
+	return nil
+}
+
+// GetAttr implements p9.File; the requested mask is ignored and a fixed set of fields is returned.
+func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) {
+	hostStat, err := stat(l.file.FD())
+	if err != nil {
+		return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err)
+	}
+	// Mask of the attributes that are reported as valid to the caller.
+	valid := p9.AttrMask{
+		Mode:   true,
+		UID:    true,
+		GID:    true,
+		NLink:  true,
+		RDev:   true,
+		Size:   true,
+		Blocks: true,
+		ATime:  true,
+		MTime:  true,
+		CTime:  true,
+	}
+	attr := p9.Attr{
+		Mode:             p9.FileMode(hostStat.Mode),
+		UID:              p9.UID(hostStat.Uid),
+		GID:              p9.GID(hostStat.Gid),
+		NLink:            uint64(hostStat.Nlink),
+		RDev:             hostStat.Rdev,
+		Size:             uint64(hostStat.Size),
+		BlockSize:        uint64(hostStat.Blksize),
+		Blocks:           uint64(hostStat.Blocks),
+		ATimeSeconds:     uint64(hostStat.Atim.Sec),
+		ATimeNanoSeconds: uint64(hostStat.Atim.Nsec),
+		MTimeSeconds:     uint64(hostStat.Mtim.Sec),
+		MTimeNanoSeconds: uint64(hostStat.Mtim.Nsec),
+		CTimeSeconds:     uint64(hostStat.Ctim.Sec),
+		CTimeNanoSeconds: uint64(hostStat.Ctim.Nsec),
+	}
+
+	return l.attachPoint.makeQID(hostStat), valid, attr, nil
+}
+
+// SetAttr implements p9.File. Due to mismatch in file API, options
+// cannot be changed atomically and user may see partial changes when
+// an error happens.
+func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
+	conf := l.attachPoint.conf
+	if conf.ROMount {
+		if conf.PanicOnWrite {
+			panic("attempt to write to RO mount")
+		}
+		return syscall.EBADF
+	}
+
+	allowed := p9.SetAttrMask{
+		Permissions:        true,
+		UID:                true,
+		GID:                true,
+		Size:               true,
+		ATime:              true,
+		MTime:              true,
+		ATimeNotSystemTime: true,
+		MTimeNotSystemTime: true,
+	}
+
+	if valid.Empty() {
+		// Nothing to do.
+		return nil
+	}
+
+	// Handle all the sanity checks up front so that the client gets a
+	// consistent result that is not attribute dependent.
+	if !valid.IsSubsetOf(allowed) {
+		log.Warningf("SetAttr() failed for %q, mask: %v", l.hostPath, valid)
+		return syscall.EPERM
+	}
+
+	// Check if it's possible to use cached file, or if another one needs to be
+	// opened for write.
+	f := l.file
+	if l.ft == regular && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite {
+		var err error
+		f, err = reopenProcFd(l.file, openFlags|os.O_WRONLY)
+		if err != nil {
+			return extractErrno(err)
+		}
+		defer f.Close()
+	}
+
+	// The semantics are to either return an error if no changes were made,
+	// or no error if *all* changes were made. Well, this can be impossible
+	// if the filesystem rejects at least one of the changes, especially
+	// since some operations are not easy to undo atomically.
+	//
+	// This could be made better if SetAttr actually returned the changes
+	// it did make, so the client can at least know what has changed. So
+	// we at least attempt to make all of the changes and return a generic
+	// error if any of them fails, which at least doesn't bias any change
+	// over another.
+	var err error
+	if valid.Permissions {
+		if cerr := syscall.Fchmod(f.FD(), uint32(attr.Permissions)); cerr != nil {
+			log.Debugf("SetAttr fchmod failed %q, err: %v", l.hostPath, cerr)
+			err = extractErrno(cerr)
+		}
+	}
+
+	if valid.Size {
+		if terr := syscall.Ftruncate(f.FD(), int64(attr.Size)); terr != nil {
+			log.Debugf("SetAttr ftruncate failed %q, err: %v", l.hostPath, terr)
+			err = extractErrno(terr)
+		}
+	}
+
+	if valid.ATime || valid.MTime {
+		utimes := [2]syscall.Timespec{
+			{Sec: 0, Nsec: linux.UTIME_OMIT},
+			{Sec: 0, Nsec: linux.UTIME_OMIT},
+		}
+		if valid.ATime {
+			if valid.ATimeNotSystemTime {
+				utimes[0].Sec = int64(attr.ATimeSeconds)
+				utimes[0].Nsec = int64(attr.ATimeNanoSeconds)
+			} else {
+				utimes[0].Nsec = linux.UTIME_NOW
+			}
+		}
+		if valid.MTime {
+			if valid.MTimeNotSystemTime {
+				utimes[1].Sec = int64(attr.MTimeSeconds)
+				utimes[1].Nsec = int64(attr.MTimeNanoSeconds)
+			} else {
+				utimes[1].Nsec = linux.UTIME_NOW
+			}
+		}
+
+		if l.ft == symlink {
+			// utimensat operates differently than other syscalls. To operate on a
+			// symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty
+			// name.
+			parent, perr := syscall.Open(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0)
+			if perr != nil {
+				return extractErrno(perr)
+			}
+			defer syscall.Close(parent)
+			// 'err' below is the function-level variable, so the failure reaches the caller.
+			if terr := utimensat(parent, path.Base(l.hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); terr != nil {
+				log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr)
+				err = extractErrno(terr)
+			}
+		} else {
+			// Directories and regular files can operate directly on the fd
+			// using empty name.
+			if terr := utimensat(f.FD(), "", utimes, 0); terr != nil {
+				log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr)
+				err = extractErrno(terr)
+			}
+		}
+	}
+
+	if valid.UID || valid.GID {
+		uid := -1
+		if valid.UID {
+			uid = int(attr.UID)
+		}
+		gid := -1
+		if valid.GID {
+			gid = int(attr.GID)
+		}
+		if oerr := syscall.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil {
+			log.Debugf("SetAttr fchownat failed %q, err: %v", l.hostPath, oerr)
+			err = extractErrno(oerr)
+		}
+	}
+
+	return err
+}
+
+func (*localFile) GetXattr(string, uint64) (string, error) {
+	return "", syscall.EOPNOTSUPP // always fails: no extended attribute lookup is performed
+}
+
+func (*localFile) SetXattr(string, string, uint32) error {
+	return syscall.EOPNOTSUPP // always fails: no extended attribute is ever written
+}
+
+func (*localFile) ListXattr(uint64) (map[string]struct{}, error) {
+	return nil, syscall.EOPNOTSUPP // always fails: no extended attributes are enumerated
+}
+
+func (*localFile) RemoveXattr(string) error {
+	return syscall.EOPNOTSUPP // always fails: no extended attribute is ever removed
+}
+
+// Allocate implements p9.File via fallocate(2); a file that is not open yields EBADF.
+func (l *localFile) Allocate(mode p9.AllocateMode, offset, length uint64) error {
+	if l.mode == invalidMode {
+		return syscall.EBADF
+	}
+	hostFD, flags := l.file.FD(), mode.ToLinux()
+	if ferr := syscall.Fallocate(hostFD, flags, int64(offset), int64(length)); ferr != nil {
+		return extractErrno(ferr)
+	}
+	return nil
+}
+
+// Rename implements p9.File. It is never expected to be called and panics unconditionally.
+func (*localFile) Rename(p9.File, string) error {
+	panic("rename called directly")
+}
+
+// RenameAt implements p9.File.RenameAt using renameat on the two directory descriptors.
+func (l *localFile) RenameAt(oldName string, directory p9.File, newName string) error {
+	if conf := l.attachPoint.conf; conf.ROMount {
+		if conf.PanicOnWrite {
+			panic("attempt to write to RO mount")
+		}
+		return syscall.EBADF
+	}
+	// The type assertion panics if directory is not backed by a *localFile.
+	dstDir := directory.(*localFile)
+	rerr := renameat(l.file.FD(), oldName, dstDir.file.FD(), newName)
+	if rerr != nil {
+		return extractErrno(rerr)
+	}
+	return nil
+}
+
+// ReadAt implements p9.File.
+//
+// The file must have been opened for reading, otherwise EBADF is returned.
+// Hitting io.EOF is not treated as an error: the bytes read so far are
+// returned together with a nil error.
+func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) {
+	readable := l.mode == p9.ReadOnly || l.mode == p9.ReadWrite
+	if !readable || !l.isOpen() {
+		return 0, syscall.EBADF
+	}
+
+	n, err := l.file.ReadAt(p, int64(offset))
+	if err == nil || err == io.EOF {
+		return n, nil
+	}
+	return n, extractErrno(err)
+}
+
+// WriteAt implements p9.File.
+//
+// The file must have been opened for writing, otherwise EBADF is returned.
+func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) {
+	writable := l.mode == p9.WriteOnly || l.mode == p9.ReadWrite
+	if !writable || !l.isOpen() {
+		return 0, syscall.EBADF
+	}
+
+	n, werr := l.file.WriteAt(p, int64(offset))
+	if werr == nil {
+		return n, nil
+	}
+	return n, extractErrno(werr)
+}
+
+// Symlink implements p9.File. If changing ownership of, or stat'ing, the new
+// link fails, the link is removed again on a best-effort basis.
+func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) {
+	if conf := l.attachPoint.conf; conf.ROMount {
+		if conf.PanicOnWrite {
+			panic("attempt to write to RO mount")
+		}
+		return p9.QID{}, syscall.EBADF
+	}
+
+	if serr := unix.Symlinkat(target, l.file.FD(), newName); serr != nil {
+		return p9.QID{}, extractErrno(serr)
+	}
+	undo := specutils.MakeCleanup(func() {
+		// Runs on every return path below unless Release() was called first.
+		if uerr := syscall.Unlinkat(l.file.FD(), newName); uerr != nil {
+			log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, newName), uerr)
+		}
+	})
+	defer undo.Clean()
+
+	// The link itself is opened (O_PATH) so it can be chowned and stat'ed.
+	link, err := fd.OpenAt(l.file, newName, unix.O_PATH|openFlags, 0)
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+	defer link.Close()
+
+	if cerr := fchown(link.FD(), uid, gid); cerr != nil {
+		return p9.QID{}, extractErrno(cerr)
+	}
+	linkStat, err := stat(link.FD())
+	if err != nil {
+		return p9.QID{}, extractErrno(err)
+	}
+
+	undo.Release()
+	return l.attachPoint.makeQID(linkStat), nil
+}
+
+// Link implements p9.File by calling linkat(2) to link target into directory l as newName.
+func (l *localFile) Link(target p9.File, newName string) error {
+	if conf := l.attachPoint.conf; conf.ROMount {
+		if conf.PanicOnWrite {
+			panic("attempt to write to RO mount")
+		}
+		return syscall.EBADF
+	}
+	// The type assertion panics if target is not backed by a *localFile.
+	src := target.(*localFile)
+	lerr := unix.Linkat(src.file.FD(), "", l.file.FD(), newName, linux.AT_EMPTY_PATH)
+	if lerr != nil {
+		return extractErrno(lerr)
+	}
+	return nil
+}
+
+// Mknod implements p9.File.
+//
+// Not implemented: every call fails with EPERM and creates nothing.
+func (*localFile) Mknod(_ string, _ p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) {
+	// From mknod(2) man page:
+	// "EPERM: [...] if the filesystem containing pathname does not support
+	// the type of node requested."
+	return p9.QID{}, syscall.EPERM
+}
+
+// UnlinkAt implements p9.File by calling unlinkat(2) relative to l's descriptor.
+func (l *localFile) UnlinkAt(name string, flags uint32) error {
+	if conf := l.attachPoint.conf; conf.ROMount {
+		if conf.PanicOnWrite {
+			panic("attempt to write to RO mount")
+		}
+		return syscall.EBADF
+	}
+	// 'flags' is handed to the host call unmodified.
+	uerr := unix.Unlinkat(l.file.FD(), name, int(flags))
+	if uerr != nil {
+		return extractErrno(uerr)
+	}
+	return nil
+}
+
+// Readdir implements p9.File.
+func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) {
+	if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite {
+		return nil, syscall.EBADF
+	}
+	if !l.isOpen() {
+		return nil, syscall.EBADF
+	}
+
+	// Readdirnames is a cursor over directories, so seek back to 0 to ensure it's
+	// reading all directory contents. Take a lock because this operation is
+	// stateful.
+	l.readDirMu.Lock()
+	defer l.readDirMu.Unlock()
+
+	skip := uint64(0)
+
+	// Check if the file is at the correct position already. If not, seek to the
+	// beginning and read the entire directory again.
+	if l.lastDirentOffset != offset {
+		if _, err := syscall.Seek(l.file.FD(), 0, 0); err != nil {
+			return nil, extractErrno(err)
+		}
+		skip = offset
+	}
+
+	dirents, err := l.readDirent(l.file.FD(), offset, count, skip)
+	if err == nil {
+		// On success, remember the offset that was returned at the current
+		// position.
+		l.lastDirentOffset = offset + uint64(len(dirents))
+	} else {
+		// On failure, the state is unknown, force call to seek() next time.
+		l.lastDirentOffset = math.MaxUint64
+	}
+	return dirents, err
+}
+
+// readDirent reads entries from directory descriptor f, discarding the first
+// 'skip' names, until 'count' or more entries were gathered or the directory
+// ends. Entries that cannot be stat'ed are logged and left out of the result.
+func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64) ([]p9.Dirent, error) {
+	var dirents []p9.Dirent
+
+	// Limit 'count' to cap the slice size that is returned.
+	const maxCount = 100000
+	if count > maxCount {
+		count = maxCount
+	}
+
+	// Pre-allocate buffers that will be reused to get partial results.
+	direntsBuf := make([]byte, 8192)
+	names := make([]string, 0, 100)
+
+	end := offset + uint64(count)
+	for offset < end {
+		dirSize, err := syscall.ReadDirent(f, direntsBuf)
+		if err != nil {
+			return dirents, err
+		}
+		if dirSize <= 0 {
+			return dirents, nil
+		}
+		_, _, batch := syscall.ParseDirent(direntsBuf[:dirSize], -1, names[:0])
+		// Skip over entries that the caller is not interested in.
+		if skip > 0 {
+			if skip > uint64(len(batch)) {
+				skip -= uint64(len(batch))
+				batch = batch[:0]
+			} else {
+				batch = batch[skip:]
+				skip = 0
+			}
+		}
+		for _, name := range batch {
+			stat, err := statAt(l.file.FD(), name)
+			if err != nil {
+				log.Warningf("Readdir is skipping file with failed stat %q, err: %v", path.Join(l.hostPath, name), err)
+				continue
+			}
+			qid := l.attachPoint.makeQID(stat)
+			offset++
+			dirents = append(dirents, p9.Dirent{
+				QID:    qid,
+				Type:   qid.Type,
+				Name:   name,
+				Offset: offset,
+			})
+		}
+	}
+	return dirents, nil
+}
+
+// Readlink implements p9.File; ENOMEM is returned if the target never fits the largest buffer.
+func (l *localFile) Readlink() (string, error) {
+	// Buffer doubling as in os.Readlink, with an added upper bound on the size.
+	const limit = 1024 * 1024
+	for size := 128; size < limit; size *= 2 {
+		buf := make([]byte, size)
+		n, err := unix.Readlinkat(l.file.FD(), "", buf)
+		if err != nil {
+			return "", extractErrno(err)
+		}
+		if n < size {
+			return string(buf[:n]), nil
+		}
+	}
+	return "", syscall.ENOMEM
+}
+
+// Flush implements p9.File. It does nothing and always returns nil.
+func (l *localFile) Flush() error {
+	return nil
+}
+
+// Connect implements p9.File by connecting a new host Unix socket to l.hostPath.
+func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) {
+	if !l.attachPoint.conf.HostUDS {
+		return nil, syscall.ECONNREFUSED
+	}
+
+	// TODO(gvisor.dev/issue/1003): Due to different app vs replacement
+	// mappings, the app path may have fit in the sockaddr, but we can't
+	// fit l.hostPath in our sockaddr. We'd need to redirect through a
+	// shorter path in order to actually connect to this socket.
+	if len(l.hostPath) > linux.UnixPathMax {
+		return nil, syscall.ECONNREFUSED
+	}
+
+	var sockType int
+	switch flags {
+	case p9.StreamSocket:
+		sockType = syscall.SOCK_STREAM
+	case p9.DgramSocket:
+		sockType = syscall.SOCK_DGRAM
+	case p9.SeqpacketSocket:
+		sockType = syscall.SOCK_SEQPACKET
+	default:
+		return nil, syscall.ENXIO
+	}
+
+	sock, err := syscall.Socket(syscall.AF_UNIX, sockType, 0)
+	if err != nil {
+		return nil, err
+	}
+	// From here on the raw descriptor is closed by hand on every failure path.
+	if err := syscall.SetNonblock(sock, true); err != nil {
+		syscall.Close(sock)
+		return nil, err
+	}
+
+	addr := syscall.SockaddrUnix{Name: l.hostPath}
+	if err := syscall.Connect(sock, &addr); err != nil {
+		syscall.Close(sock)
+		return nil, err
+	}
+
+	return fd.New(sock), nil
+}
+
+// Close implements p9.File. It marks l as not open, closes the host file and drops the reference.
+func (l *localFile) Close() error {
+	l.mode = invalidMode // isOpen() reports false from here on
+	err := l.file.Close()
+	l.file = nil // NOTE(review): a second Close() would call Close on a nil file — confirm callers never do that
+	return err
+}
+
+func (l *localFile) isOpen() bool {
+	return l.mode != invalidMode // any mode other than invalidMode counts as open
+}
+
+// Renamed implements p9.Renamed by recomputing hostPath from the new parent's hostPath and newName.
+func (l *localFile) Renamed(newDir p9.File, newName string) {
+	l.hostPath = path.Join(newDir.(*localFile).hostPath, newName)
+}
+
+// extractErrno maps err to an errno: sentinels and os wrappers are translated, the rest is EIO.
+func extractErrno(err error) syscall.Errno {
+	if err == nil {
+		// This should never happen. The likely result will be that
+		// some user gets the frustrating "error: SUCCESS" message.
+		log.Warningf("extractErrno called with nil error!")
+		return 0
+	}
+
+	switch err {
+	case os.ErrNotExist:
+		return syscall.ENOENT
+	case os.ErrExist:
+		return syscall.EEXIST
+	case os.ErrPermission:
+		return syscall.EACCES
+	case os.ErrInvalid:
+		return syscall.EINVAL
+	}
+
+	// Unwrap the common os wrappers around an errno; anything else becomes EIO.
+	switch e := err.(type) {
+	case syscall.Errno:
+		return e
+	case *os.PathError:
+		err = e.Err
+	case *os.LinkError:
+		err = e.Err
+	case *os.SyscallError:
+		err = e.Err
+	default:
+		log.Debugf("Unknown error: %v, defaulting to EIO", err)
+		return syscall.EIO
+	}
+	return extractErrno(err)
+}
diff --git a/runsc/fsgofer/fsgofer_amd64_unsafe.go b/runsc/fsgofer/fsgofer_amd64_unsafe.go
new file mode 100644
index 000000000..5d4aab597
--- /dev/null
+++ b/runsc/fsgofer/fsgofer_amd64_unsafe.go
@@ -0,0 +1,49 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package fsgofer
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/syserr"
+)
+
+// statAt stats 'name' relative to directory descriptor dirFd, passing
+// AT_SYMLINK_NOFOLLOW to the newfstatat system call.
+func statAt(dirFd int, name string) (syscall.Stat_t, error) {
+	cName, err := syscall.BytePtrFromString(name)
+	if err != nil {
+		return syscall.Stat_t{}, err
+	}
+
+	// Pointers are converted to uintptr inside the call expression itself.
+	var st syscall.Stat_t
+	_, _, errno := syscall.Syscall6(
+		syscall.SYS_NEWFSTATAT,
+		uintptr(dirFd),
+		uintptr(unsafe.Pointer(cName)),
+		uintptr(unsafe.Pointer(&st)),
+		linux.AT_SYMLINK_NOFOLLOW,
+		0,
+		0)
+	if errno != 0 {
+		return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
+	}
+	return st, nil
+}
diff --git a/runsc/fsgofer/fsgofer_arm64_unsafe.go b/runsc/fsgofer/fsgofer_arm64_unsafe.go
new file mode 100644
index 000000000..8041fd352
--- /dev/null
+++ b/runsc/fsgofer/fsgofer_arm64_unsafe.go
@@ -0,0 +1,49 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package fsgofer
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/syserr"
+)
+
+// statAt stats 'name' relative to directory descriptor dirFd, passing
+// AT_SYMLINK_NOFOLLOW to the fstatat system call.
+func statAt(dirFd int, name string) (syscall.Stat_t, error) {
+	cName, err := syscall.BytePtrFromString(name)
+	if err != nil {
+		return syscall.Stat_t{}, err
+	}
+
+	// Pointers are converted to uintptr inside the call expression itself.
+	var st syscall.Stat_t
+	_, _, errno := syscall.Syscall6(
+		syscall.SYS_FSTATAT,
+		uintptr(dirFd),
+		uintptr(unsafe.Pointer(cName)),
+		uintptr(unsafe.Pointer(&st)),
+		linux.AT_SYMLINK_NOFOLLOW,
+		0,
+		0)
+	if errno != 0 {
+		return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
+	}
+	return st, nil
+}
diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go
new file mode 100644
index 000000000..05af7e397
--- /dev/null
+++ b/runsc/fsgofer/fsgofer_test.go
@@ -0,0 +1,692 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsgofer
+
+import (
+	"fmt"
+	"io/ioutil"
+	"net"
+	"os"
+	"path"
+	"path/filepath"
+	"syscall"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/p9"
+)
+
+func init() {
+	log.SetLevel(log.Debug)
+
+	// allConfs is the read-write configurations followed by the read-only ones.
+	allConfs = append(append(allConfs, rwConfs...), roConfs...)
+
+	if err := OpenProcSelfFD(); err != nil {
+		panic(err)
+	}
+}
+
+func assertPanic(t *testing.T, f func()) {
+	defer func() {
+		if recover() == nil { // nothing to recover from means f returned normally
+			t.Errorf("function did not panic")
+		}
+	}()
+	f()
+}
+
+// testReadWrite checks that WriteAt/ReadAt on f succeed or fail with EBADF as dictated by flags.
+func testReadWrite(f p9.File, flags p9.OpenFlags, content []byte) error {
+	want := append([]byte(nil), content...)
+
+	b := []byte("test-1-2-3")
+	w, err := f.WriteAt(b, uint64(len(content)))
+	if flags == p9.WriteOnly || flags == p9.ReadWrite {
+		if err != nil {
+			return fmt.Errorf("WriteAt(): %v", err)
+		}
+		if w != len(b) {
+			return fmt.Errorf("WriteAt() was partial, got: %d, want: %d", w, len(b))
+		}
+		want = append(want, b...)
+	} else {
+		if e, ok := err.(syscall.Errno); !ok || e != syscall.EBADF {
+			return fmt.Errorf("WriteAt() should have failed, got: %v, want: EBADF", err)
+		}
+	}
+
+	rBuf := make([]byte, len(want))
+	r, err := f.ReadAt(rBuf, 0)
+	if flags == p9.ReadOnly || flags == p9.ReadWrite {
+		if err != nil {
+			return fmt.Errorf("ReadAt(): %v", err)
+		}
+		if r != len(rBuf) {
+			return fmt.Errorf("ReadAt() was partial, got: %d, want: %d", r, len(rBuf))
+		}
+		if string(rBuf) != string(want) {
+			return fmt.Errorf("ReadAt() wrong data, got: %s, want: %s", string(rBuf), want)
+		}
+	} else {
+		if e, ok := err.(syscall.Errno); !ok || e != syscall.EBADF {
+			return fmt.Errorf("ReadAt() should have failed, got: %v, want: EBADF", err)
+		}
+	}
+	return nil
+}
+
+var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite} // open modes iterated by the tests
+
+var (
+	allTypes = []fileType{regular, directory, symlink} // file types used by runAll
+
+	// allConfs is set in init() above.
+	allConfs []Config
+
+	rwConfs = []Config{{ROMount: false}} // writable mount configurations
+	roConfs = []Config{{ROMount: true}}  // read-only mount configurations
+)
+
+type state struct {
+	root *localFile // file returned by Attach() for the test's attach point
+	file *localFile // file under test, walked to from root
+	conf Config     // configuration the attach point was created with
+	ft   fileType   // type of file that setup() created
+}
+
+func (s state) String() string {
+	return fmt.Sprintf("type(%v)", s.ft) // only the file type is printed; conf is not included
+}
+
+func runAll(t *testing.T, test func(*testing.T, state)) {
+	runCustom(t, allTypes, allConfs, test) // every file type under every configuration
+}
+
+// runCustom runs test once for every combination of the given configurations and file types.
+func runCustom(t *testing.T, types []fileType, confs []Config, test func(*testing.T, state)) {
+	for _, c := range confs {
+		t.Logf("Config: %+v", c)
+		for _, ft := range types {
+			t.Logf("File type: %v", ft)
+
+			path, name, err := setup(ft)
+			if err != nil {
+				t.Fatalf("%v", err)
+			}
+			defer os.RemoveAll(path)
+
+			a, err := NewAttachPoint(path, c)
+			if err != nil {
+				t.Fatalf("NewAttachPoint failed: %v", err)
+			}
+			root, err := a.Attach()
+			if err != nil {
+				t.Fatalf("Attach failed, err: %v", err)
+			}
+
+			_, file, err := root.Walk([]string{name})
+			if err != nil {
+				root.Close()
+				t.Fatalf("root.Walk({%q}) failed, err: %v", name, err)
+			}
+
+			st := state{root: root.(*localFile), file: file.(*localFile), conf: c, ft: ft}
+			test(t, st)
+			file.Close()
+			root.Close()
+		}
+	}
+}
+
+// setup creates a temp directory holding one entry of type ft and returns the directory and entry name.
+func setup(ft fileType) (string, string, error) {
+	path, err := ioutil.TempDir("", "root-")
+	if err != nil {
+		return "", "", fmt.Errorf("ioutil.TempDir() failed, err: %v", err)
+	}
+	// First attach with writable configuration to setup tree.
+	a, err := NewAttachPoint(path, Config{})
+	if err != nil {
+		return "", "", err
+	}
+	root, err := a.Attach()
+	if err != nil {
+		return "", "", fmt.Errorf("Attach failed, err: %v", err)
+	}
+	defer root.Close()
+
+	var name string
+	switch ft {
+	case regular:
+		name = "file"
+		_, f, _, _, err := root.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
+		if err != nil {
+			return "", "", fmt.Errorf("root.Create(%q) failed, err: %v", name, err)
+		}
+		defer f.Close()
+	case directory:
+		name = "dir"
+		if _, err := root.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
+			return "", "", fmt.Errorf("root.MkDir(%q) failed, err: %v", name, err)
+		}
+	case symlink:
+		name = "symlink"
+		if _, err := root.Symlink("/some/target", name, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
+			return "", "", fmt.Errorf("root.Symlink(%q) failed, err: %v", name, err)
+		}
+	default:
+		panic(fmt.Sprintf("unknown file type %v", ft))
+	}
+	return path, name, nil
+}
+
+func createFile(dir *localFile, name string) (*localFile, error) {
+	_, f, _, _, err := dir.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
+	if err != nil {
+		return nil, err
+	}
+	return f.(*localFile), nil // Create returns a p9.File; callers here want the concrete type
+}
+
+// TestReadWrite checks reads and writes on a file that is walked to and opened in every mode.
+func TestReadWrite(t *testing.T) {
+	runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+		child, err := createFile(s.file, "test")
+		if err != nil {
+			t.Fatalf("%v: createFile() failed, err: %v", s, err)
+		}
+		defer child.Close()
+		want := []byte("foobar")
+		if w, err := child.WriteAt(want, 0); err != nil {
+			t.Fatalf("%v: Write() failed, err: %v", s, err)
+		} else if w != len(want) {
+			t.Fatalf("%v: Write() was partial, got: %d, expected: %d", s, w, len(want))
+		}
+		for _, flags := range allOpenFlags {
+			_, l, err := s.file.Walk([]string{"test"})
+			if err != nil {
+				t.Fatalf("%v: Walk(%s) failed, err: %v", s, "test", err)
+			}
+			defer l.Close() // release every walked file when the closure returns
+			if _, _, _, err := l.Open(flags); err != nil {
+				t.Fatalf("%v: Open(%v) failed, err: %v", s, flags, err)
+			}
+			if err := testReadWrite(l, flags, want); err != nil {
+				t.Fatalf("%v: testReadWrite(%v) failed: %v", s, flags, err)
+			}
+		}
+	})
+}
+
+func TestCreate(t *testing.T) {
+	runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+		for i, flags := range allOpenFlags {
+			_, l, _, _, err := s.file.Create(fmt.Sprintf("test-%d", i), flags, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
+			if err != nil {
+				t.Fatalf("%v, %v: Create() failed, err: %v", s, flags, err)
+			}
+			defer l.Close() // release every created file when the closure returns
+			if err := testReadWrite(l, flags, []byte{}); err != nil {
+				t.Fatalf("%v: testReadWrite(%v) failed: %v", s, flags, err)
+			}
+		}
+	})
+}
+
+// TestReadWriteDup tests that a file opened in any mode can be dup'ed and
+// reopened in any other mode.
+func TestReadWriteDup(t *testing.T) {
+	runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+		child, err := createFile(s.file, "test")
+		if err != nil {
+			t.Fatalf("%v: createFile() failed, err: %v", s, err)
+		}
+		defer child.Close()
+		want := []byte("foobar")
+		w, err := child.WriteAt(want, 0)
+		if err != nil {
+			t.Fatalf("%v: Write() failed, err: %v", s, err)
+		}
+		if w != len(want) {
+			t.Fatalf("%v: Write() was partial, got: %d, expected: %d", s, w, len(want))
+		}
+		for _, flags := range allOpenFlags {
+			_, l, err := s.file.Walk([]string{"test"})
+			if err != nil {
+				t.Fatalf("%v: Walk(%s) failed, err: %v", s, "test", err)
+			}
+			defer l.Close()
+			if _, _, _, err := l.Open(flags); err != nil {
+				t.Fatalf("%v: Open(%v) failed, err: %v", s, flags, err)
+			}
+			for _, dupFlags := range allOpenFlags {
+				t.Logf("Original flags: %v, dup flags: %v", flags, dupFlags)
+				_, dup, err := l.Walk([]string{}) // an empty name list duplicates l
+				if err != nil {
+					t.Fatalf("%v: Walk(<empty>) failed: %v", s, err)
+				}
+				defer dup.Close()
+				if _, _, _, err := dup.Open(dupFlags); err != nil {
+					t.Fatalf("%v: Open(%v) failed: %v", s, dupFlags, err)
+				}
+				if err := testReadWrite(dup, dupFlags, want); err != nil {
+					t.Fatalf("%v: testReadWrite(%v) failed: %v", s, dupFlags, err)
+				}
+			}
+		}
+	})
+}
+
+func TestUnopened(t *testing.T) {
+	runCustom(t, []fileType{regular}, allConfs, func(t *testing.T, s state) { // runCustom walks to s.file but never calls Open on it
+		b := []byte("foobar")
+		if _, err := s.file.WriteAt(b, 0); err != syscall.EBADF {
+			t.Errorf("%v: WriteAt() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+		if _, err := s.file.ReadAt(b, 0); err != syscall.EBADF {
+			t.Errorf("%v: ReadAt() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+		if _, err := s.file.Readdir(0, 100); err != syscall.EBADF {
+			t.Errorf("%v: Readdir() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+		if err := s.file.FSync(); err != syscall.EBADF {
+			t.Errorf("%v: FSync() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+	})
+}
+
+// SetGetAttr applies the attribute change to l and returns the attributes read back afterwards.
+func SetGetAttr(l *localFile, valid p9.SetAttrMask, attr p9.SetAttr) (p9.Attr, error) {
+	if err := l.SetAttr(valid, attr); err != nil {
+		return p9.Attr{}, err
+	}
+
+	// localFile.GetAttr already yields a zero p9.Attr alongside any error it reports.
+	_, _, got, err := l.GetAttr(p9.AttrMask{})
+	return got, err
+}
+
+// TestSetAttrPerm expects permission changes to fail on symlinks and to take effect otherwise.
+func TestSetAttrPerm(t *testing.T) {
+	runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) {
+		attr := p9.SetAttr{Permissions: 0777}
+		got, err := SetGetAttr(s.file, p9.SetAttrMask{Permissions: true}, attr)
+		if s.ft == symlink {
+			if err == nil {
+				t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions)
+			}
+			return
+		}
+		if err != nil {
+			t.Fatalf("%v: SetGetAttr(valid, %v) failed, err: %v", s, attr.Permissions, err)
+		}
+		if got.Mode.Permissions() != attr.Permissions {
+			t.Errorf("%v: wrong permission, got: %v, expected: %v", s, got.Mode.Permissions(), attr.Permissions)
+		}
+	})
+}
+
+// TestSetAttrSize expects size changes to work on regular files and to fail on other file types.
+func TestSetAttrSize(t *testing.T) {
+	runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) {
+		for _, size := range []uint64{1024, 0, 1024 * 1024} {
+			attr := p9.SetAttr{Size: size}
+			got, err := SetGetAttr(s.file, p9.SetAttrMask{Size: true}, attr)
+			if s.ft == symlink || s.ft == directory {
+				if err == nil {
+					t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Size)
+				}
+				// Run for one size only, they will all fail the same way.
+				return
+			}
+			if err != nil {
+				t.Fatalf("%v: SetGetAttr(valid, %v) failed, err: %v", s, attr.Size, err)
+			}
+			if got.Size != size {
+				t.Errorf("%v: wrong size, got: %v, expected: %v", s, got.Size, size)
+			}
+		}
+	})
+}
+
+// TestSetAttrTime sets explicit access and modification times and checks they are read back.
+func TestSetAttrTime(t *testing.T) {
+	runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) {
+		attr := p9.SetAttr{ATimeSeconds: 123, ATimeNanoSeconds: 456}
+		got, err := SetGetAttr(s.file, p9.SetAttrMask{ATime: true, ATimeNotSystemTime: true}, attr)
+		if err != nil {
+			t.Fatalf("%v: SetGetAttr(valid, %v:%v) failed, err: %v", s, attr.ATimeSeconds, attr.ATimeNanoSeconds, err)
+		}
+		if got.ATimeSeconds != 123 {
+			t.Errorf("%v: wrong ATimeSeconds, got: %v, expected: %v", s, got.ATimeSeconds, 123)
+		}
+		if got.ATimeNanoSeconds != 456 {
+			t.Errorf("%v: wrong ATimeNanoSeconds, got: %v, expected: %v", s, got.ATimeNanoSeconds, 456)
+		}
+
+		// Same check for the modification time (decimal 12; a leading zero would make it octal).
+		attr = p9.SetAttr{MTimeSeconds: 789, MTimeNanoSeconds: 12}
+		got, err = SetGetAttr(s.file, p9.SetAttrMask{MTime: true, MTimeNotSystemTime: true}, attr)
+		if err != nil {
+			t.Fatalf("%v: SetGetAttr(valid, %v:%v) failed, err: %v", s, attr.MTimeSeconds, attr.MTimeNanoSeconds, err)
+		}
+		if got.MTimeSeconds != 789 {
+			t.Errorf("%v: wrong MTimeSeconds, got: %v, expected: %v", s, got.MTimeSeconds, 789)
+		}
+		if got.MTimeNanoSeconds != 12 {
+			t.Errorf("%v: wrong MTimeNanoSeconds, got: %v, expected: %v", s, got.MTimeNanoSeconds, 12)
+		}
+	})
+}
+
+// TestSetAttrOwner changes the owner UID of each file type and checks it is read back; root only.
+func TestSetAttrOwner(t *testing.T) {
+	if os.Getuid() != 0 {
+		t.Skipf("SetAttr(owner) test requires CAP_CHOWN, running as %d", os.Getuid())
+	}
+
+	runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) {
+		wantUID := os.Getuid() + 1
+		attr := p9.SetAttr{UID: p9.UID(wantUID)}
+		got, err := SetGetAttr(s.file, p9.SetAttrMask{UID: true}, attr)
+		if err != nil {
+			t.Fatalf("%v: SetGetAttr(valid, %v) failed, err: %v", s, attr.UID, err)
+		}
+		if got.UID != p9.UID(wantUID) {
+			t.Errorf("%v: wrong uid, got: %v, expected: %v", s, got.UID, wantUID)
+		}
+	})
+}
+
+// TestLink links each file type into a fresh directory; linking a directory must fail with EPERM.
+func TestLink(t *testing.T) {
+	if os.Getuid() != 0 {
+		t.Skipf("Link test requires CAP_DAC_READ_SEARCH, running as %d", os.Getuid())
+	}
+	runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) {
+		const dirName, linkFile = "linkdir", "link"
+		if _, err := s.root.Mkdir(dirName, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
+			t.Fatalf("%v: MkDir(%s) failed, err: %v", s, dirName, err)
+		}
+		_, dir, err := s.root.Walk([]string{dirName})
+		if err != nil {
+			t.Fatalf("%v: Walk({%s}) failed, err: %v", s, dirName, err)
+		}
+		defer dir.Close() // the walked directory is not closed by runCustom
+		err = dir.Link(s.file, linkFile)
+		if s.ft == directory {
+			if err != syscall.EPERM {
+				t.Errorf("%v: Link(target, %s) should have failed, got: %v, expected: syscall.EPERM", s, linkFile, err)
+			}
+			return
+		}
+		if err != nil {
+			t.Errorf("%v: Link(target, %s) failed, err: %v", s, linkFile, err)
+		}
+	})
+}
+
+// TestROMountChecks verifies that every mutating operation fails with EBADF on a read-only mount.
+func TestROMountChecks(t *testing.T) {
+	runCustom(t, allTypes, roConfs, func(t *testing.T, s state) {
+		uid, gid := p9.UID(os.Getuid()), p9.GID(os.Getgid())
+		if _, _, _, _, err := s.file.Create("some_file", p9.ReadWrite, 0777, uid, gid); err != syscall.EBADF {
+			t.Errorf("%v: Create() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+		if _, err := s.file.Mkdir("some_dir", 0777, uid, gid); err != syscall.EBADF {
+			t.Errorf("%v: MkDir() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+		if err := s.file.RenameAt("some_file", s.file, "other_file"); err != syscall.EBADF {
+			t.Errorf("%v: Rename() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+		if _, err := s.file.Symlink("some_place", "some_symlink", uid, gid); err != syscall.EBADF {
+			t.Errorf("%v: Symlink() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+		if err := s.file.UnlinkAt("some_file", 0); err != syscall.EBADF {
+			t.Errorf("%v: UnlinkAt() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+		if err := s.file.Link(s.file, "some_link"); err != syscall.EBADF {
+			t.Errorf("%v: Link() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+		// Attribute changes count as writes too.
+		if err := s.file.SetAttr(p9.SetAttrMask{Size: true}, p9.SetAttr{Size: 0}); err != syscall.EBADF {
+			t.Errorf("%v: SetAttr() should have failed, got: %v, expected: syscall.EBADF", s, err)
+		}
+	})
+}
+
+// TestROMountPanics verifies that, with ROMount and PanicOnWrite both set,
+// every mutating operation panics.
+func TestROMountPanics(t *testing.T) {
+	runCustom(t, allTypes, []Config{{ROMount: true, PanicOnWrite: true}}, func(t *testing.T, s state) {
+		uid, gid := p9.UID(os.Getuid()), p9.GID(os.Getgid())
+		assertPanic(t, func() { s.file.Create("some_file", p9.ReadWrite, 0777, uid, gid) })
+		assertPanic(t, func() { s.file.Mkdir("some_dir", 0777, uid, gid) })
+		assertPanic(t, func() { s.file.RenameAt("some_file", s.file, "other_file") })
+		assertPanic(t, func() { s.file.Symlink("some_place", "some_symlink", uid, gid) })
+		assertPanic(t, func() { s.file.UnlinkAt("some_file", 0) })
+		assertPanic(t, func() { s.file.Link(s.file, "some_link") })
+
+		assertPanic(t, func() { s.file.SetAttr(p9.SetAttrMask{Size: true}, p9.SetAttr{Size: 0}) })
+	})
+}
+
+func TestWalkNotFound(t *testing.T) {
+	runCustom(t, []fileType{directory}, allConfs, func(t *testing.T, st state) {
+		if _, _, walkErr := st.file.Walk([]string{"nobody-here"}); walkErr != syscall.ENOENT {
+			t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: syscall.ENOENT", st, "nobody-here", walkErr)
+		}
+	})
+}
+
+func TestWalkDup(t *testing.T) {
+	runAll(t, func(t *testing.T, st state) {
+		// An empty walk should hand back a duplicate; GetAttr proves it is usable.
+		_, clone, walkErr := st.file.Walk([]string{})
+		if walkErr != nil {
+			t.Fatalf("%v: Walk(nil) failed, err: %v", st, walkErr)
+		}
+		if _, _, _, err := clone.GetAttr(p9.AttrMask{}); err != nil {
+			t.Errorf("%v: GetAttr() failed, err: %v", st, err)
+		}
+	})
+}
+
+// TestReaddir creates a directory, a symlink and a regular file, then checks the names and types Readdir reports.
+func TestReaddir(t *testing.T) {
+	runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+		const dirName, linkName, fileName = "dir", "symlink", "file"
+		uid, gid := p9.UID(os.Getuid()), p9.GID(os.Getgid())
+		if _, err := s.file.Mkdir(dirName, 0777, uid, gid); err != nil {
+			t.Fatalf("%v: MkDir(%s) failed, err: %v", s, dirName, err)
+		}
+		if _, err := s.file.Symlink("/some/target", linkName, uid, gid); err != nil {
+			t.Fatalf("%v: Symlink(%q) failed, err: %v", s, linkName, err)
+		}
+		_, created, _, _, err := s.file.Create(fileName, p9.ReadWrite, 0555, uid, gid)
+		if err != nil {
+			t.Fatalf("%v: createFile(root, %q) failed, err: %v", s, fileName, err)
+		}
+		created.Close()
+
+		if _, _, _, err := s.file.Open(p9.ReadOnly); err != nil {
+			t.Fatalf("%v: Open(ReadOnly) failed, err: %v", s, err)
+		}
+
+		entries, err := s.file.Readdir(0, 10)
+		if err != nil {
+			t.Fatalf("%v: Readdir(0, 10) failed, err: %v", s, err)
+		}
+		if len(entries) != 3 {
+			t.Fatalf("%v: Readdir(0, 10) wrong number of items, got: %v, expected: 3", s, len(entries))
+		}
+		var sawDir, sawLink, sawFile bool
+		for _, ent := range entries {
+			switch ent.Name {
+			case dirName:
+				sawDir = true
+				if ent.Type != p9.TypeDir {
+					t.Errorf("%v: dirent.Type got: %v, expected: %v", s, ent.Type, p9.TypeDir)
+				}
+			case linkName:
+				sawLink = true
+				if ent.Type != p9.TypeSymlink {
+					t.Errorf("%v: dirent.Type got: %v, expected: %v", s, ent.Type, p9.TypeSymlink)
+				}
+			case fileName:
+				sawFile = true
+				if ent.Type != p9.TypeRegular {
+					t.Errorf("%v: dirent.Type got: %v, expected: %v", s, ent.Type, p9.TypeRegular)
+				}
+			default:
+				t.Errorf("%v: dirent.Name got: %v", s, ent.Name)
+			}
+
+			_, child, err := s.file.Walk([]string{ent.Name})
+			if err != nil {
+				t.Fatalf("%v: Walk({%s}) failed, err: %v", s, ent.Name, err)
+			}
+			_, _, attr, err := child.GetAttr(p9.AttrMask{})
+			if err != nil {
+				t.Fatalf("%v: GetAttr() failed, err: %v", s, err)
+			}
+			if ent.Type != attr.Mode.QIDType() {
+				t.Errorf("%v: dirent.Type different than GetAttr().Mode.QIDType(), got: %v, expected: %v", s, ent.Type, attr.Mode.QIDType())
+			}
+		}
+		if !(sawDir && sawLink && sawFile) {
+			t.Errorf("%v: Readdir(0, 10) wrong files returned, dir: %v, symlink: %v, file: %v", s, sawDir, sawLink, sawFile)
+		}
+	})
+}
+
+// TestAttachFile checks that an attach point backed by a regular file (e.g.
+// /etc/hosts) can be opened read-write, written to and read back.
+func TestAttachFile(t *testing.T) {
+	dir, err := ioutil.TempDir("", "root-")
+	if err != nil {
+		t.Fatalf("ioutil.TempDir() failed, err: %v", err)
+	}
+	defer os.RemoveAll(dir)
+
+	// ioutil.WriteFile creates the empty file and closes it, so no FD is leaked.
+	filePath := path.Join(dir, "test")
+	if err := ioutil.WriteFile(filePath, nil, 0666); err != nil {
+		t.Fatalf("ioutil.WriteFile(%q) failed, err: %v", filePath, err)
+	}
+
+	a, err := NewAttachPoint(filePath, Config{ROMount: false})
+	if err != nil {
+		t.Fatalf("NewAttachPoint failed: %v", err)
+	}
+	root, err := a.Attach()
+	if err != nil {
+		t.Fatalf("Attach failed, err: %v", err)
+	}
+
+	if _, _, _, err := root.Open(p9.ReadWrite); err != nil {
+		t.Fatalf("Open(ReadWrite) failed, err: %v", err)
+	}
+	defer root.Close()
+
+	b := []byte("foobar")
+	w, err := root.WriteAt(b, 0)
+	if err != nil {
+		t.Fatalf("Write() failed, err: %v", err)
+	}
+	if w != len(b) {
+		t.Fatalf("Write() was partial, got: %d, expected: %d", w, len(b))
+	}
+	rBuf := make([]byte, len(b))
+	r, err := root.ReadAt(rBuf, 0)
+	if err != nil {
+		t.Fatalf("ReadAt() failed, err: %v", err)
+	}
+	if r != len(rBuf) {
+		t.Fatalf("ReadAt() was partial, got: %d, expected: %d", r, len(rBuf))
+	}
+	if string(rBuf) != "foobar" {
+		t.Fatalf("ReadAt() wrong data, got: %s, expected: %s", string(rBuf), "foobar")
+	}
+}
+
+// TestAttachInvalidType checks that Attach fails when the attach point is a
+// FIFO or a Unix domain socket.
+func TestAttachInvalidType(t *testing.T) {
+	tmpDir, err := ioutil.TempDir("", "attach-")
+	if err != nil {
+		t.Fatalf("ioutil.TempDir() failed, err: %v", err)
+	}
+	defer os.RemoveAll(tmpDir)
+
+	fifo := filepath.Join(tmpDir, "fifo")
+	if err := syscall.Mkfifo(fifo, 0755); err != nil {
+		t.Fatalf("Mkfifo(%q): %v", fifo, err)
+	}
+
+	dirFile, err := os.Open(tmpDir)
+	if err != nil {
+		t.Fatalf("Open(%s): %v", tmpDir, err)
+	}
+	defer dirFile.Close()
+
+	// Bind the socket through /proc/self/fd so that the socket path stays
+	// shorter than UNIX_PATH_MAX.
+	socket := filepath.Join(fmt.Sprintf("/proc/self/fd/%d", dirFile.Fd()), "socket")
+	l, err := net.Listen("unix", socket)
+	if err != nil {
+		t.Fatalf("net.Listen(unix, %q): %v", socket, err)
+	}
+	defer l.Close()
+
+	tests := []struct{ name, path string }{
+		{"fifo", fifo},
+		{"socket", socket},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			// Creating the attach point is expected to work; only Attach should fail.
+			a, err := NewAttachPoint(tc.path, Config{ROMount: false})
+			if err != nil {
+				t.Fatalf("NewAttachPoint failed: %v", err)
+			}
+			f, err := a.Attach()
+			if f != nil || err == nil {
+				t.Fatalf("Attach should have failed, got (%v, %v)", f, err)
+			}
+		})
+	}
+}
+
+// TestDoubleAttachError checks that a second Attach on the same attach point fails.
+func TestDoubleAttachError(t *testing.T) {
+	tmpDir, err := ioutil.TempDir("", "root-")
+	if err != nil {
+		t.Fatalf("ioutil.TempDir() failed, err: %v", err)
+	}
+	defer os.RemoveAll(tmpDir)
+	ap, err := NewAttachPoint(tmpDir, Config{ROMount: false})
+	if err != nil {
+		t.Fatalf("NewAttachPoint failed: %v", err)
+	}
+
+	if _, err := ap.Attach(); err != nil {
+		t.Fatalf("Attach failed: %v", err)
+	}
+	if _, err := ap.Attach(); err == nil {
+		t.Fatalf("Attach should have failed, got %v want non-nil", err)
+	}
+}
diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go
new file mode 100644
index 000000000..542b54365
--- /dev/null
+++ b/runsc/fsgofer/fsgofer_unsafe.go
@@ -0,0 +1,82 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsgofer
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/syserr"
+)
+
+// utimensat invokes utimensat(2) to set the timestamps in 'times' on 'name',
+// resolved relative to 'dirFd'. Host errnos are converted through syserr.
+func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) error {
+	// utimensat(2) rejects an empty name; unlike other *at syscalls, a nil name
+	// is what makes it operate directly on 'dirFd'.
+	var pathPtr unsafe.Pointer
+	if name != "" {
+		cstr, err := syscall.BytePtrFromString(name)
+		if err != nil {
+			return err
+		}
+		pathPtr = unsafe.Pointer(cstr)
+	}
+
+	// 'times' was passed by value, so the kernel reads this call's own copy.
+	_, _, errno := syscall.Syscall6(
+		syscall.SYS_UTIMENSAT,
+		uintptr(dirFd),
+		uintptr(pathPtr),
+		uintptr(unsafe.Pointer(&times[0])),
+		uintptr(flags),
+		0, 0)
+	if errno != 0 {
+		return syserr.FromHost(errno).ToError()
+	}
+	return nil
+}
+
+// renameat invokes renameat(2); an empty name is handed to the kernel as a nil pointer.
+func renameat(oldDirFD int, oldName string, newDirFD int, newName string) error {
+	var oldPtr unsafe.Pointer
+	if oldName != "" {
+		cstr, err := syscall.BytePtrFromString(oldName)
+		if err != nil {
+			return err
+		}
+		oldPtr = unsafe.Pointer(cstr)
+	}
+	var newPtr unsafe.Pointer
+	if newName != "" {
+		cstr, err := syscall.BytePtrFromString(newName)
+		if err != nil {
+			return err
+		}
+		newPtr = unsafe.Pointer(cstr)
+	}
+
+	_, _, errno := syscall.Syscall6(
+		syscall.SYS_RENAMEAT,
+		uintptr(oldDirFD),
+		uintptr(oldPtr),
+		uintptr(newDirFD),
+		uintptr(newPtr),
+		0, 0)
+	if errno != 0 {
+		return syserr.FromHost(errno).ToError()
+	}
+	return nil
+}
diff --git a/runsc/main.go b/runsc/main.go
new file mode 100644
index 000000000..0625a06e0
--- /dev/null
+++ b/runsc/main.go
@@ -0,0 +1,368 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Binary runsc is an implementation of the Open Container Initiative Runtime
+// that runs applications inside a sandbox.
+package main
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"os"
+	"os/signal"
+	"path/filepath"
+	"strings"
+	"syscall"
+	"time"
+
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/cmd"
+	"gvisor.dev/gvisor/runsc/flag"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+var (
+	// Although these flags are not part of the OCI spec, they are used by
+	// Docker, and thus should not be changed.
+	rootDir     = flag.String("root", "", "root directory for storage of container state.")
+	logFilename = flag.String("log", "", "file path where internal debug information is written, default is stdout.")
+	logFormat   = flag.String("log-format", "text", "log format: text (default), json, or json-k8s.")
+	debug       = flag.Bool("debug", false, "enable debug logging.")
+	showVersion = flag.Bool("version", false, "show version and exit.")
+	// TODO(gvisor.dev/issue/193): support systemd cgroups
+	systemdCgroup = flag.Bool("systemd-cgroup", false, "Use systemd for cgroups. NOT SUPPORTED.")
+
+	// These flags are unique to runsc, and are used to configure parts of the
+	// system that are not covered by the runtime spec.
+
+	// Debugging flags.
+	debugLog        = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
+	panicLog        = flag.String("panic-log", "", "file path where panic reports and other Go's runtime messages are written.")
+	logPackets      = flag.Bool("log-packets", false, "enable network packet logging.")
+	logFD           = flag.Int("log-fd", -1, "file descriptor to log to.  If set, the 'log' flag is ignored.")
+	debugLogFD      = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to.  If set, the 'debug-log' flag is ignored.")
+	panicLogFD      = flag.Int("panic-log-fd", -1, "file descriptor to write Go's runtime messages.")
+	debugLogFormat  = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.")
+	alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr.")
+
+	// Debugging flags: strace related
+	strace         = flag.Bool("strace", false, "enable strace.")
+	straceSyscalls = flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.")
+	straceLogSize  = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs.")
+
+	// Flags that control sandbox runtime behavior.
+	platformName       = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.")
+	network            = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
+	hardwareGSO        = flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.")
+	softwareGSO        = flag.Bool("software-gso", true, "enable software segmentation offload when hardware offload can't be enabled.")
+	qDisc              = flag.String("qdisc", "fifo", "specifies which queueing discipline to apply by default to the non loopback nics used by the sandbox.")
+	fileAccess         = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
+	fsGoferHostUDS     = flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.")
+	overlay            = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
+	overlayfsStaleRead = flag.Bool("overlayfs-stale-read", false, "reopen cached FDs after a file is opened for write to workaround overlayfs limitation on kernels before 4.19.")
+	watchdogAction     = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
+	panicSignal        = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
+	profile            = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
+	netRaw             = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
+	numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.")
+	rootless           = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
+	referenceLeakMode  = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.")
+	cpuNumFromQuota    = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)")
+	vfs2Enabled        = flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.")
+
+	// Test flags, not to be used outside tests, ever.
+	testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
+	testOnlyTestNameEnv                        = flag.String("TESTONLY-test-name-env", "", "TEST ONLY; do not ever use! Used for automated tests to improve logging.")
+)
+
+// main parses flags into a boot.Config, sets up logging and executes the requested subcommand.
+func main() {
+	// Help and flags commands are generated automatically.
+	help := cmd.NewHelp(subcommands.DefaultCommander)
+	help.Register(new(cmd.Syscalls))
+	subcommands.Register(help, "")
+	subcommands.Register(subcommands.FlagsCommand(), "")
+
+	// Installation helpers.
+	const helperGroup = "helpers"
+	subcommands.Register(new(cmd.Install), helperGroup)
+	subcommands.Register(new(cmd.Uninstall), helperGroup)
+
+	// Register user-facing runsc commands.
+	subcommands.Register(new(cmd.Checkpoint), "")
+	subcommands.Register(new(cmd.Create), "")
+	subcommands.Register(new(cmd.Delete), "")
+	subcommands.Register(new(cmd.Do), "")
+	subcommands.Register(new(cmd.Events), "")
+	subcommands.Register(new(cmd.Exec), "")
+	subcommands.Register(new(cmd.Kill), "")
+	subcommands.Register(new(cmd.List), "")
+	subcommands.Register(new(cmd.Pause), "")
+	subcommands.Register(new(cmd.PS), "")
+	subcommands.Register(new(cmd.Restore), "")
+	subcommands.Register(new(cmd.Resume), "")
+	subcommands.Register(new(cmd.Run), "")
+	subcommands.Register(new(cmd.Spec), "")
+	subcommands.Register(new(cmd.State), "")
+	subcommands.Register(new(cmd.Start), "")
+	subcommands.Register(new(cmd.Wait), "")
+
+	// Register internal commands with the internal group name. This causes
+	// them to be sorted below the user-facing commands with empty group.
+	// The string below will be printed above the commands.
+	const internalGroup = "internal use only"
+	subcommands.Register(new(cmd.Boot), internalGroup)
+	subcommands.Register(new(cmd.Debug), internalGroup)
+	subcommands.Register(new(cmd.Gofer), internalGroup)
+	subcommands.Register(new(cmd.Statefile), internalGroup)
+
+	// All subcommands must be registered before flag parsing.
+	flag.Parse()
+
+	// Are we showing the version?
+	if *showVersion {
+		// The format here is the same as runc.
+		fmt.Fprintf(os.Stdout, "runsc version %s\n", version)
+		fmt.Fprintf(os.Stdout, "spec: %s\n", specutils.Version)
+		os.Exit(0)
+	}
+
+	// TODO(gvisor.dev/issue/193): support systemd cgroups
+	if *systemdCgroup {
+		fmt.Fprintln(os.Stderr, "systemd cgroup flag passed, but systemd cgroups not supported. See gvisor.dev/issue/193")
+		os.Exit(1)
+	}
+
+	var errorLogger io.Writer
+	if *logFD > -1 {
+		errorLogger = os.NewFile(uintptr(*logFD), "error log file")
+
+	} else if *logFilename != "" {
+		// We must set O_APPEND and not O_TRUNC because Docker passes
+		// the same log file for all commands (and also parses these
+		// log files), so we can't destroy them on each command.
+		var err error
+		errorLogger, err = os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
+		if err != nil {
+			cmd.Fatalf("error opening log file %q: %v", *logFilename, err)
+		}
+	}
+	cmd.ErrorLogger = errorLogger
+
+	platformType := *platformName
+	if _, err := platform.Lookup(platformType); err != nil {
+		cmd.Fatalf("%v", err)
+	}
+
+	fsAccess, err := boot.MakeFileAccessType(*fileAccess)
+	if err != nil {
+		cmd.Fatalf("%v", err)
+	}
+
+	if fsAccess == boot.FileAccessShared && *overlay {
+		cmd.Fatalf("overlay flag is incompatible with shared file access")
+	}
+
+	netType, err := boot.MakeNetworkType(*network)
+	if err != nil {
+		cmd.Fatalf("%v", err)
+	}
+
+	wa, err := boot.MakeWatchdogAction(*watchdogAction)
+	if err != nil {
+		cmd.Fatalf("%v", err)
+	}
+
+	if *numNetworkChannels <= 0 {
+		cmd.Fatalf("num-network-channels must be > 0, got: %d", *numNetworkChannels)
+	}
+
+	refsLeakMode, err := boot.MakeRefsLeakMode(*referenceLeakMode)
+	if err != nil {
+		cmd.Fatalf("%v", err)
+	}
+
+	queueingDiscipline, err := boot.MakeQueueingDiscipline(*qDisc)
+	if err != nil {
+		cmd.Fatalf("%s", err)
+	}
+
+	// Sets the reference leak check mode. Also set it in config below to
+	// propagate it to child processes.
+	refs.SetLeakMode(refsLeakMode)
+
+	// Create a new Config from the flags.
+	conf := &boot.Config{
+		RootDir:            *rootDir,
+		Debug:              *debug,
+		LogFilename:        *logFilename,
+		LogFormat:          *logFormat,
+		DebugLog:           *debugLog,
+		PanicLog:           *panicLog,
+		DebugLogFormat:     *debugLogFormat,
+		FileAccess:         fsAccess,
+		FSGoferHostUDS:     *fsGoferHostUDS,
+		Overlay:            *overlay,
+		Network:            netType,
+		HardwareGSO:        *hardwareGSO,
+		SoftwareGSO:        *softwareGSO,
+		LogPackets:         *logPackets,
+		Platform:           platformType,
+		Strace:             *strace,
+		StraceLogSize:      *straceLogSize,
+		WatchdogAction:     wa,
+		PanicSignal:        *panicSignal,
+		ProfileEnable:      *profile,
+		EnableRaw:          *netRaw,
+		NumNetworkChannels: *numNetworkChannels,
+		Rootless:           *rootless,
+		AlsoLogToStderr:    *alsoLogToStderr,
+		ReferenceLeakMode:  refsLeakMode,
+		OverlayfsStaleRead: *overlayfsStaleRead,
+		CPUNumFromQuota:    *cpuNumFromQuota,
+		VFS2:               *vfs2Enabled,
+		QDisc:              queueingDiscipline,
+		TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
+		TestOnlyTestNameEnv:                        *testOnlyTestNameEnv,
+	}
+	if len(*straceSyscalls) != 0 {
+		conf.StraceSyscalls = strings.Split(*straceSyscalls, ",")
+	}
+
+	// Set up logging.
+	if *debug {
+		log.SetLevel(log.Debug)
+	}
+
+	// Logging will include the local date and time via the time package.
+	//
+	// On first use, time.Local initializes the local time zone, which
+	// involves opening tzdata files on the host. Since this requires
+	// opening host files, it must be done before syscall filter
+	// installation.
+	//
+	// Generally there will be a log message before filter installation
+	// that will force initialization, but force initialization here in
+	// case that does not occur.
+	_ = time.Local.String()
+
+	subcommand := flag.CommandLine.Arg(0)
+
+	var e log.Emitter
+	if *debugLogFD > -1 {
+		f := os.NewFile(uintptr(*debugLogFD), "debug log file")
+
+		e = newEmitter(*debugLogFormat, f)
+
+	} else if *debugLog != "" {
+		f, err := specutils.DebugLogFile(*debugLog, subcommand, "" /* name */)
+		if err != nil {
+			cmd.Fatalf("error opening debug log file in %q: %v", *debugLog, err)
+		}
+		e = newEmitter(*debugLogFormat, f)
+
+	} else {
+		// Stderr is reserved for the application, just discard the logs if no debug
+		// log is specified.
+		e = newEmitter("text", ioutil.Discard)
+	}
+
+	if *panicLogFD > -1 || *debugLogFD > -1 {
+		fd := *panicLogFD
+		if fd < 0 {
+			fd = *debugLogFD
+		}
+		// Quick sanity check to make sure no other commands get passed
+		// a log fd (they should use log dir instead).
+		if subcommand != "boot" && subcommand != "gofer" {
+			cmd.Fatalf("flags --debug-log-fd and --panic-log-fd should only be passed to 'boot' and 'gofer' command, but was passed to %q", subcommand)
+		}
+
+		// If we are the boot process, then we own our stdio FDs and can do what we
+		// want with them. Since Docker and Containerd both eat boot's stderr, we
+		// dup our stderr to the provided log FD so that panics will appear in the
+		// logs, rather than just disappear.
+		if err := syscall.Dup3(fd, int(os.Stderr.Fd()), 0); err != nil {
+			cmd.Fatalf("error dup'ing fd %d to stderr: %v", fd, err)
+		}
+	} else if *alsoLogToStderr {
+		e = &log.MultiEmitter{e, newEmitter(*debugLogFormat, os.Stderr)}
+	}
+
+	log.SetTarget(e)
+
+	log.Infof("***************************")
+	log.Infof("Args: %s", os.Args)
+	log.Infof("Version %s", version)
+	log.Infof("PID: %d", os.Getpid())
+	log.Infof("UID: %d, GID: %d", os.Getuid(), os.Getgid())
+	log.Infof("Configuration:")
+	log.Infof("\t\tRootDir: %s", conf.RootDir)
+	log.Infof("\t\tPlatform: %v", conf.Platform)
+	log.Infof("\t\tFileAccess: %v, overlay: %t", conf.FileAccess, conf.Overlay)
+	log.Infof("\t\tNetwork: %v, logging: %t", conf.Network, conf.LogPackets)
+	log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls)
+	log.Infof("\t\tVFS2 enabled: %v", conf.VFS2)
+	log.Infof("***************************")
+
+	if *testOnlyAllowRunAsCurrentUserWithoutChroot {
+		// SIGTERM is sent to all processes if a test exceeds its
+		// timeout and this case is handled by syscall_test_runner.
+		log.Warningf("Block the TERM signal. This is only safe in tests!")
+		signal.Ignore(syscall.SIGTERM)
+	}
+
+	// Call the subcommand and pass in the configuration.
+	var ws syscall.WaitStatus
+	subcmdCode := subcommands.Execute(context.Background(), conf, &ws)
+	if subcmdCode == subcommands.ExitSuccess {
+		log.Infof("Exiting with status: %v", ws)
+		if ws.Signaled() {
+			// No good way to return it, emulate what the shell does. Maybe raise
+			// signal to self?
+			os.Exit(128 + int(ws.Signal()))
+		}
+		os.Exit(ws.ExitStatus())
+	}
+	// Return an error that is unlikely to be used by the application.
+	log.Warningf("Failure to execute command, err: %v", subcmdCode)
+	os.Exit(128)
+}
+
+func newEmitter(format string, logFile io.Writer) log.Emitter {
+	switch w := (&log.Writer{Next: logFile}); format {
+	case "text":
+		return log.GoogleEmitter{w}
+	case "json":
+		return log.JSONEmitter{w}
+	case "json-k8s":
+		return log.K8sJSONEmitter{w}
+	}
+	cmd.Fatalf("invalid log format %q, must be 'text', 'json', or 'json-k8s'", format)
+	panic("unreachable")
+}
+
+func init() {
+	// Default the root dir to a (hopefully) user-writable location.
+	*rootDir = "/var/run/runsc"
+	if xdgDir, ok := os.LookupEnv("XDG_RUNTIME_DIR"); ok && xdgDir != "" {
+		*rootDir = filepath.Join(xdgDir, "runsc")
+	}
+}
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
new file mode 100644
index 000000000..c95d50294
--- /dev/null
+++ b/runsc/sandbox/BUILD
@@ -0,0 +1,36 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+# Go library for the sandbox package, built from network.go, network_unsafe.go
+# and sandbox.go. Only packages under //runsc may depend on it.
+go_library(
+    name = "sandbox",
+    srcs = [
+        "network.go",
+        "network_unsafe.go",
+        "sandbox.go",
+    ],
+    visibility = ["//runsc:__subpackages__"],
+    deps = [
+        "//pkg/control/client",
+        "//pkg/control/server",
+        "//pkg/log",
+        "//pkg/sentry/control",
+        "//pkg/sentry/platform",
+        "//pkg/sync",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/stack",
+        "//pkg/urpc",
+        "//runsc/boot",
+        "//runsc/boot/platforms",
+        "//runsc/cgroup",
+        "//runsc/console",
+        "//runsc/specutils",
+        "@com_github_cenkalti_backoff//:go_default_library",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@com_github_syndtr_gocapability//capability:go_default_library",
+        "@com_github_vishvananda_netlink//:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
new file mode 100644
index 000000000..209bfdb20
--- /dev/null
+++ b/runsc/sandbox/network.go
@@ -0,0 +1,410 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sandbox
+
+import (
+	"fmt"
+	"net"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strconv"
+	"syscall"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/vishvananda/netlink"
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/urpc"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+// setupNetwork configures networking for the sandbox according to
+// conf.Network:
+//
+//   - boot.NetworkNone: only a default loopback interface is created.
+//   - boot.NetworkSandbox: the interfaces, addresses and routes found in the
+//     network namespace of the sandbox process (pid) are recreated inside the
+//     sandbox, so the untrusted app sees the same network it would have seen
+//     in that namespace. Netstack talks to each device through raw sockets
+//     and the IP addresses are removed from the host NICs and handed over to
+//     netstack's devices.
+//   - boot.NetworkHost: nothing is configured here.
+//
+// Any other value is rejected with an error. spec is currently unused.
+// Run the following container to test it:
+//  docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4
+func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Config) error {
+	log.Infof("Setting up network")
+
+	switch mode := conf.Network; mode {
+	case boot.NetworkHost:
+		// Nothing to set up for host networking.
+	case boot.NetworkNone:
+		log.Infof("Network is disabled, create loopback interface only")
+		if err := createDefaultLoopbackInterface(conn); err != nil {
+			return fmt.Errorf("creating default loopback interface: %v", err)
+		}
+	case boot.NetworkSandbox:
+		// The net namespace of the sandbox process is the one that gets copied.
+		pidDir := strconv.Itoa(pid)
+		netNSPath := filepath.Join("/proc", pidDir, "ns/net")
+		if err := createInterfacesAndRoutesFromNS(conn, netNSPath, conf.HardwareGSO, conf.SoftwareGSO, conf.NumNetworkChannels, conf.QDisc); err != nil {
+			return fmt.Errorf("creating interfaces from net namespace %q: %v", netNSPath, err)
+		}
+	default:
+		return fmt.Errorf("invalid network type: %d", mode)
+	}
+	return nil
+}
+
+func createDefaultLoopbackInterface(conn *urpc.Client) error {
+	// The request contains nothing but the default loopback link.
+	args := boot.CreateLinksAndRoutesArgs{LoopbackLinks: []boot.LoopbackLink{boot.DefaultLoopbackLink}}
+	if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil {
+		return fmt.Errorf("creating loopback link and routes: %v", err)
+	}
+	return nil
+}
+
+func joinNetNS(nsPath string) (func(), error) {
+	// Pin this goroutine to its OS thread for as long as the namespace is joined.
+	runtime.LockOSThread()
+	target := specs.LinuxNamespace{Type: specs.NetworkNamespace, Path: nsPath}
+	leave, err := specutils.ApplyNS(target)
+	if err != nil {
+		runtime.UnlockOSThread()
+		return nil, fmt.Errorf("joining net namespace %q: %v", nsPath, err)
+	}
+	undo := func() {
+		leave()
+		runtime.UnlockOSThread()
+	}
+	return undo, nil
+}
+
+// isRootNS reports whether the caller runs in the root net namespace. The probe
+// is /proc/sys/net/core/rmem_default, which only exists in the root namespace.
+func isRootNS() (bool, error) {
+	err := syscall.Access("/proc/sys/net/core/rmem_default", syscall.F_OK)
+	if err == nil {
+		return true, nil
+	}
+	if err == syscall.ENOENT {
+		return false, nil
+	}
+	// Any other failure leaves the question unanswered; report it.
+	return false, fmt.Errorf("failed to access /proc/sys/net/core/rmem_default: %v", err)
+}
+
+// createInterfacesAndRoutesFromNS scrapes the interfaces and routes from the
+// net namespace at nsPath, hands them to the sandbox over conn, and removes
+// the scraped IP addresses from the host devices. It refuses the root netns.
+func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, numNetworkChannels int, qDisc boot.QueueingDiscipline) error {
+	// Join the network namespace that we will be copying.
+	restore, err := joinNetNS(nsPath)
+	if err != nil {
+		return err
+	}
+	defer restore()
+
+	// Get all interfaces in the namespace.
+	ifaces, err := net.Interfaces()
+	if err != nil {
+		return fmt.Errorf("querying interfaces: %v", err)
+	}
+
+	isRoot, err := isRootNS()
+	if err != nil {
+		return err
+	}
+	if isRoot {
+		// Addresses are removed from the NICs below; never do that to the host.
+		return fmt.Errorf("cannot run with network enabled in root network namespace")
+	}
+
+	// Collect addresses and routes from the interfaces.
+	var args boot.CreateLinksAndRoutesArgs
+	for _, iface := range ifaces {
+		if iface.Flags&net.FlagUp == 0 {
+			log.Infof("Skipping down interface: %+v", iface)
+			continue
+		}
+
+		allAddrs, err := iface.Addrs()
+		if err != nil {
+			return fmt.Errorf("fetching interface addresses for %q: %v", iface.Name, err)
+		}
+
+		// We build our own loopback device.
+		if iface.Flags&net.FlagLoopback != 0 {
+			link, err := loopbackLink(iface, allAddrs)
+			if err != nil {
+				return fmt.Errorf("getting loopback link for iface %q: %v", iface.Name, err)
+			}
+			args.LoopbackLinks = append(args.LoopbackLinks, link)
+			continue
+		}
+
+		var ipAddrs []*net.IPNet
+		for _, ifaddr := range allAddrs {
+			ipNet, ok := ifaddr.(*net.IPNet)
+			if !ok {
+				return fmt.Errorf("address is not IPNet: %+v", ifaddr)
+			}
+			ipAddrs = append(ipAddrs, ipNet)
+		}
+		if len(ipAddrs) == 0 {
+			log.Warningf("No usable IP addresses found for interface %q, skipping", iface.Name)
+			continue
+		}
+
+		// Scrape the routes before removing the address, since that
+		// will remove the routes as well.
+		routes, defv4, defv6, err := routesForIface(iface)
+		if err != nil {
+			return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err)
+		}
+		if defv4 != nil {
+			if !args.Defaultv4Gateway.Route.Empty() {
+				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway)
+			}
+			args.Defaultv4Gateway.Route = *defv4
+			args.Defaultv4Gateway.Name = iface.Name
+		}
+
+		if defv6 != nil {
+			if !args.Defaultv6Gateway.Route.Empty() {
+				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway)
+			}
+			args.Defaultv6Gateway.Route = *defv6
+			args.Defaultv6Gateway.Name = iface.Name
+		}
+
+		link := boot.FDBasedLink{
+			Name:        iface.Name,
+			MTU:         iface.MTU,
+			Routes:      routes,
+			NumChannels: numNetworkChannels,
+			QDisc:       qDisc,
+		}
+
+		// Get the link for the interface.
+		ifaceLink, err := netlink.LinkByName(iface.Name)
+		if err != nil {
+			return fmt.Errorf("getting link for interface %q: %v", iface.Name, err)
+		}
+		link.LinkAddress = ifaceLink.Attrs().HardwareAddr
+
+		log.Debugf("Setting up network channels")
+		// Create the socket for the device.
+		for i := 0; i < link.NumChannels; i++ {
+			log.Debugf("Creating Channel %d", i)
+			socketEntry, err := createSocket(iface, ifaceLink, hardwareGSO)
+			if err != nil {
+				return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err)
+			}
+			if i == 0 {
+				link.GSOMaxSize = socketEntry.gsoMaxSize
+			} else {
+				if link.GSOMaxSize != socketEntry.gsoMaxSize {
+					return fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s",
+						link.GSOMaxSize, socketEntry.gsoMaxSize, iface.Name)
+				}
+			}
+			args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile)
+		}
+
+		if link.GSOMaxSize == 0 && softwareGSO {
+			// Hardware GSO is disabled. Let's enable software GSO.
+			link.GSOMaxSize = stack.SoftwareGSOMaxSize
+			link.SoftwareGSOEnabled = true
+		}
+
+		// Collect the addresses for the interface and remove them from the
+		// host device.
+		for _, addr := range ipAddrs {
+			link.Addresses = append(link.Addresses, addr.IP)
+
+			// Steal IP address from NIC.
+			if err := removeAddress(ifaceLink, addr.String()); err != nil {
+				return fmt.Errorf("removing address %v from device %q: %v", addr, iface.Name, err)
+			}
+		}
+
+		args.FDBasedLinks = append(args.FDBasedLinks, link)
+	}
+
+	log.Debugf("Setting up network, config: %+v", args)
+	if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil {
+		return fmt.Errorf("creating links and routes: %v", err)
+	}
+	return nil
+}
+
+type socketEntry struct {
+	deviceFile *os.File // wraps the AF_PACKET socket fd created by createSocket
+	gsoMaxSize uint32   // stays 0 unless createSocket enabled PACKET_VNET_HDR
+}
+
+// createSocket creates an AF_PACKET socket bound to iface, configures it for use
+// by the sentry and returns it wrapped in an *os.File. The socket is closed again
+// if any configuration step fails, so the caller only owns it on success.
+func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (_ *socketEntry, retErr error) {
+	const protocol = 0x0300 // htons(ETH_P_ALL)
+	fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
+	if err != nil {
+		return nil, fmt.Errorf("unable to create raw socket: %v", err)
+	}
+	deviceFile := os.NewFile(uintptr(fd), "raw-device-fd")
+	defer func() {
+		if retErr != nil {
+			deviceFile.Close() // Release the socket promptly on failure.
+		}
+	}()
+	// Bind to the appropriate device.
+	ll := syscall.SockaddrLinklayer{
+		Protocol: protocol,
+		Ifindex:  iface.Index,
+		Hatype:   0, // No ARP type.
+		Pkttype:  syscall.PACKET_OTHERHOST,
+	}
+	if err := syscall.Bind(fd, &ll); err != nil {
+		return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
+	}
+
+	gsoMaxSize := uint32(0)
+	if enableGSO {
+		gso, err := isGSOEnabled(fd, iface.Name)
+		if err != nil {
+			return nil, fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err)
+		}
+		if gso {
+			if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
+				return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
+			}
+			gsoMaxSize = ifaceLink.Attrs().GSOMaxSize
+		} else {
+			log.Infof("GSO not available in host.")
+		}
+	}
+
+	// Use SO_RCVBUFFORCE/SO_SNDBUFFORCE because the receive/send buffer of an
+	// AF_PACKET socket is otherwise capped by net.core.rmem_max/wmem_max, whose
+	// default (208KB) is too low to receive at high throughput without drops.
+	const bufSize = 4 << 20 // 4MB.
+	if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, bufSize); err != nil {
+		return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", bufSize, err)
+	}
+	if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUFFORCE, bufSize); err != nil {
+		return nil, fmt.Errorf("failed to increase socket snd buffer to %d: %v", bufSize, err)
+	}
+	return &socketEntry{deviceFile, gsoMaxSize}, nil
+}
+
+// loopbackLink builds the boot.LoopbackLink for iface: every address in addrs
+// is recorded, together with a route to the network that address belongs to.
+func loopbackLink(iface net.Interface, addrs []net.Addr) (boot.LoopbackLink, error) {
+	lo := boot.LoopbackLink{Name: iface.Name}
+	for i := range addrs {
+		cidr, isIPNet := addrs[i].(*net.IPNet)
+		if !isIPNet {
+			return boot.LoopbackLink{}, fmt.Errorf("address is not IPNet: %+v", addrs[i])
+		}
+		lo.Addresses = append(lo.Addresses, cidr.IP)
+		// The matching route targets the address' network: the IP with the
+		// mask applied, keeping the original mask.
+		subnet := net.IPNet{
+			IP:   cidr.IP.Mask(cidr.Mask),
+			Mask: cidr.Mask,
+		}
+		lo.Routes = append(lo.Routes, boot.Route{Destination: subnet})
+	}
+	return lo, nil
+}
+
+// routesForIface converts every route of the given interface to a boot.Route.
+// Default v4/v6 routes, if present, are returned separately from the others.
+func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, *boot.Route, error) {
+	link, err := netlink.LinkByIndex(iface.Index)
+	if err != nil {
+		return nil, nil, nil, err
+	}
+	kernelRoutes, err := netlink.RouteList(link, netlink.FAMILY_ALL)
+	if err != nil {
+		return nil, nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err)
+	}
+
+	var (
+		routes       []boot.Route
+		defv4, defv6 *boot.Route
+	)
+	for _, kr := range kernelRoutes {
+		// A route with a destination is a regular one: store it with the
+		// destination reduced to its network address.
+		if kr.Dst != nil {
+			dst := net.IPNet{IP: kr.Dst.IP.Mask(kr.Dst.Mask), Mask: kr.Dst.Mask}
+			routes = append(routes, boot.Route{Destination: dst, Gateway: kr.Gw})
+			continue
+		}
+
+		// No destination: this is a default route and it must name a gateway.
+		if kr.Gw == nil {
+			return nil, nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, kr)
+		}
+		// The gateway length selects the address family. At most one default
+		// route per family is accepted; it becomes a catch-all route.
+		switch len(kr.Gw) {
+		case header.IPv4AddressSize:
+			if defv4 != nil {
+				return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv4, kr)
+			}
+			defv4 = &boot.Route{
+				Destination: net.IPNet{
+					IP:   net.IPv4zero,
+					Mask: net.IPMask(net.IPv4zero),
+				},
+				Gateway: kr.Gw,
+			}
+		case header.IPv6AddressSize:
+			if defv6 != nil {
+				return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv6, kr)
+			}
+			defv6 = &boot.Route{
+				Destination: net.IPNet{
+					IP:   net.IPv6zero,
+					Mask: net.IPMask(net.IPv6zero),
+				},
+				Gateway: kr.Gw,
+			}
+		default:
+			return nil, nil, nil, fmt.Errorf("unexpected address size for gateway: %+v for route: %+v", kr.Gw, kr)
+		}
+	}
+	return routes, defv4, defv6, nil
+}
+
+// removeAddress deletes the address ipAndMask from the device source, like:
+//   ip addr del <ipAndMask> dev <name>
+func removeAddress(source netlink.Link, ipAndMask string) error {
+	parsed, parseErr := netlink.ParseAddr(ipAndMask)
+	if parseErr != nil {
+		return parseErr
+	}
+	return netlink.AddrDel(source, parsed)
+}
diff --git a/runsc/sandbox/network_unsafe.go b/runsc/sandbox/network_unsafe.go
new file mode 100644
index 000000000..2a2a0fb7e
--- /dev/null
+++ b/runsc/sandbox/network_unsafe.go
@@ -0,0 +1,56 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sandbox
+
+import (
+	"syscall"
+	"unsafe"
+
+	"golang.org/x/sys/unix"
+)
+
+type ethtoolValue struct {
+	cmd uint32 // ethtool command to run, e.g. _ETHTOOL_GGSO
+	val uint32 // command value; isGSOEnabled reads it back after the ioctl
+}
+
+type ifreq struct {
+	ifrName [unix.IFNAMSIZ]byte // interface name, zero-padded
+	ifrData *ethtoolValue       // payload handed to the SIOCETHTOOL ioctl
+}
+
+const (
+	_ETHTOOL_GGSO = 0x00000023 // ethtool command used by isGSOEnabled to query GSO
+)
+
+func isGSOEnabled(fd int, intf string) (bool, error) {
+	// The answer to the GGSO command is read back from query.val.
+	query := ethtoolValue{cmd: _ETHTOOL_GGSO}
+
+	// Build the request: the interface name (copy stops at IFNAMSIZ bytes)
+	// plus a pointer to the query.
+	req := ifreq{ifrData: &query}
+	copy(req.ifrName[:], intf)
+
+	// Issue SIOCETHTOOL on fd; a non-zero errno is returned as the error.
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), unix.SIOCETHTOOL, uintptr(unsafe.Pointer(&req)))
+	if errno != 0 {
+		return false, errno
+	}
+
+	// Any non-zero value is reported as "enabled".
+	enabled := query.val != 0
+	return enabled, nil
+}
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
new file mode 100644
index 000000000..e4ec16e2f
--- /dev/null
+++ b/runsc/sandbox/sandbox.go
@@ -0,0 +1,1227 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package sandbox creates and manipulates sandboxes.
+package sandbox
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"math"
+	"os"
+	"os/exec"
+	"strconv"
+	"strings"
+	"syscall"
+	"time"
+
+	"github.com/cenkalti/backoff"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/syndtr/gocapability/capability"
+	"gvisor.dev/gvisor/pkg/control/client"
+	"gvisor.dev/gvisor/pkg/control/server"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/urpc"
+	"gvisor.dev/gvisor/runsc/boot"
+	"gvisor.dev/gvisor/runsc/boot/platforms"
+	"gvisor.dev/gvisor/runsc/cgroup"
+	"gvisor.dev/gvisor/runsc/console"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+// Sandbox wraps a sandbox process.
+//
+// It is used to start/stop the sandbox process (and associated processes like
+// gofers), as well as for running and manipulating containers inside a running
+// sandbox.
+//
+// Note: Sandbox must be immutable because a copy of it is saved for each
+// container and changes would not be synchronized to all of them.
+type Sandbox struct {
+	// ID is the id of the sandbox (immutable). By convention, this is the same
+	// ID as the first container run in the sandbox.
+	ID string `json:"id"`
+
+	// Pid is the pid of the running sandbox (immutable). May be 0 if the sandbox
+	// is not running.
+	Pid int `json:"pid"`
+
+	// Cgroup has the cgroup configuration for the sandbox.
+	Cgroup *cgroup.Cgroup `json:"cgroup"`
+
+	// child is set if the sandbox process is a child of the current process.
+	//
+	// This field isn't saved to json, because only the creator of the sandbox
+	// will have it as a child process.
+	child bool
+
+	// status is the exit status of the sandbox process (not saved to json).
+	status syscall.WaitStatus
+
+	// statusMu protects status.
+	statusMu sync.Mutex
+}
+
+// Args is used to configure a new sandbox.
+type Args struct {
+	// ID is the sandbox unique identifier.
+	ID string
+
+	// Spec is the OCI spec that describes the container.
+	Spec *specs.Spec
+
+	// BundleDir is the directory containing the container bundle.
+	BundleDir string
+
+	// ConsoleSocket is the path to a unix domain socket that will receive
+	// the console FD. It may be empty.
+	ConsoleSocket string
+
+	// UserLog is the filename to send user-visible logs to. It may be empty.
+	UserLog string
+
+	// IOFiles is the list of files that connect to a 9P endpoint for the mount
+	// points using Gofers. They must be in the same order as mounts appear in
+	// the spec.
+	IOFiles []*os.File
+
+	// MountsFile is a file containing mount information from the spec. It's
+	// equivalent to the mounts from the spec, except that all paths have been
+	// resolved to their final absolute location.
+	MountsFile *os.File
+
+	// Cgroup is the cgroup that the sandbox is part of.
+	Cgroup *cgroup.Cgroup
+
+	// Attached indicates that the sandbox lifecycle is attached with the caller.
+	// If the caller exits, the sandbox should exit too.
+	Attached bool
+}
+
+// New creates the sandbox process. The caller must call Destroy() on the
+// sandbox.
+func New(conf *boot.Config, args *Args) (*Sandbox, error) {
+	s := &Sandbox{ID: args.ID, Cgroup: args.Cgroup}
+	// The Cleanup object cleans up partially created sandboxes when an error
+	// occurs. A failure of the cleanup itself is only logged.
+	c := specutils.MakeCleanup(func() {
+		if err := s.destroy(); err != nil {
+			log.Warningf("error destroying sandbox: %v", err)
+		}
+	})
+	defer c.Clean()
+
+	// Create pipe to synchronize when sandbox process has been booted.
+	clientSyncFile, sandboxSyncFile, err := os.Pipe()
+	if err != nil {
+		return nil, fmt.Errorf("creating pipe for sandbox %q: %v", s.ID, err)
+	}
+	defer clientSyncFile.Close()
+
+	// Create the sandbox process.
+	err = s.createSandboxProcess(conf, args, sandboxSyncFile)
+	// sandboxSyncFile has to be closed to be able to detect when the sandbox
+	// process exits unexpectedly.
+	sandboxSyncFile.Close()
+	if err != nil {
+		return nil, err
+	}
+
+	// Wait until the sandbox has booted.
+	b := make([]byte, 1)
+	if l, err := clientSyncFile.Read(b); err != nil || l != 1 {
+		err := fmt.Errorf("waiting for sandbox to start: %v", err)
+		// If the sandbox failed to start, it may be because the binary
+		// permissions were incorrect. Check the bits and return a more helpful
+		// error message.
+		//
+		// NOTE: The message is matched; the %v wrapping above drops the error type.
+		if strings.Contains(err.Error(), io.EOF.Error()) {
+			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
+				return nil, fmt.Errorf("%v: %v", err, permsErr)
+			}
+		}
+		return nil, err
+	}
+
+	c.Release()
+	return s, nil
+}
+
+// CreateContainer asks the sandbox to create the non-root container cid.
+func (s *Sandbox) CreateContainer(cid string) error {
+	log.Debugf("Create non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid)
+	ctrl, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return fmt.Errorf("couldn't connect to sandbox: %v", connErr)
+	}
+	defer ctrl.Close()
+	// The request carries only the container ID; no reply value is requested.
+	if callErr := ctrl.Call(boot.ContainerCreate, &cid, nil); callErr != nil {
+		return fmt.Errorf("creating non-root container %q: %v", cid, callErr)
+	}
+	return nil
+}
+
+// StartRoot sets up the network and then starts the root container process.
+func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error {
+	log.Debugf("Start root sandbox %q, PID: %d", s.ID, s.Pid)
+	ctrl, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer ctrl.Close()
+
+	// Step 1: the network is configured over the same control connection,
+	// before the container is started.
+	if err = setupNetwork(ctrl, s.Pid, spec, conf); err != nil {
+		return fmt.Errorf("setting up network: %v", err)
+	}
+
+	// Step 2: ask the sandbox control server to start the root container.
+	if err = ctrl.Call(boot.RootContainerStart, &s.ID, nil); err != nil {
+		return fmt.Errorf("starting root container: %v", err)
+	}
+
+	return nil
+}
+
+// StartContainer starts the non-root container cid inside the sandbox.
+func (s *Sandbox) StartContainer(spec *specs.Spec, conf *boot.Config, cid string, goferFiles []*os.File) error {
+	// The gofer files are closed on every return path.
+	for i := range goferFiles {
+		defer goferFiles[i].Close()
+	}
+
+	log.Debugf("Start non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid)
+	ctrl, err := s.sandboxConnect()
+	if err != nil {
+		return fmt.Errorf("couldn't connect to sandbox: %v", err)
+	}
+	defer ctrl.Close()
+
+	// Payload layout: stdin, stdout, stderr, followed by the gofer files.
+	payload := []*os.File{os.Stdin, os.Stdout, os.Stderr}
+	payload = append(payload, goferFiles...)
+	req := boot.StartArgs{
+		Spec:        spec,
+		Conf:        conf,
+		CID:         cid,
+		FilePayload: urpc.FilePayload{Files: payload},
+	}
+	if err := ctrl.Call(boot.ContainerStart, &req, nil); err != nil {
+		return fmt.Errorf("starting non-root container %v: %v", spec.Process.Args, err)
+	}
+	return nil
+}
+
+// Restore sets up the network and sends the restore call, with file filename.
+func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, filename string) error {
+	log.Debugf("Restore sandbox %q", s.ID)
+
+	stateFile, err := os.Open(filename)
+	if err != nil {
+		return fmt.Errorf("opening restore file %q failed: %v", filename, err)
+	}
+	defer stateFile.Close()
+
+	opts := boot.RestoreOpts{
+		FilePayload: urpc.FilePayload{Files: []*os.File{stateFile}},
+		SandboxID:   s.ID,
+	}
+
+	// If the platform needs a device FD, it is appended after the restore file.
+	deviceFile, err := deviceFileForPlatform(conf.Platform)
+	if err != nil {
+		return err
+	}
+	if deviceFile != nil {
+		defer deviceFile.Close()
+		opts.FilePayload.Files = append(opts.FilePayload.Files, deviceFile)
+	}
+
+	ctrl, err := s.sandboxConnect()
+	if err != nil {
+		return err
+	}
+	defer ctrl.Close()
+
+	// Configure the network.
+	if err := setupNetwork(ctrl, s.Pid, spec, conf); err != nil {
+		return fmt.Errorf("setting up network: %v", err)
+	}
+
+	// Restore the container and start the root container.
+	if err := ctrl.Call(boot.ContainerRestore, &opts, nil); err != nil {
+		return fmt.Errorf("restoring container %q: %v", cid, err)
+	}
+
+	return nil
+}
+
+// Processes asks the sandbox for the processes (and their metadata) that
+// belong to container cid.
+func (s *Sandbox) Processes(cid string) ([]*control.Process, error) {
+	log.Debugf("Getting processes for container %q in sandbox %q", cid, s.ID)
+	ctrl, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return nil, connErr
+	}
+	defer ctrl.Close()
+
+	var procs []*control.Process
+	if callErr := ctrl.Call(boot.ContainerProcesses, &cid, &procs); callErr != nil {
+		return nil, fmt.Errorf("retrieving process data from sandbox: %v", callErr)
+	}
+	return procs, nil
+}
+
+// Execute runs the specified command in the container. It returns the PID of
+// the newly created process.
+func (s *Sandbox) Execute(args *control.ExecArgs) (int32, error) {
+	log.Debugf("Executing new process in container %q in sandbox %q", args.ContainerID, s.ID)
+	conn, err := s.sandboxConnect()
+	if err != nil {
+		return 0, err // sandboxConnect has already applied connError.
+	}
+	defer conn.Close()
+
+	// Ask the sandbox control server to start the process asynchronously.
+	var pid int32
+	if err := conn.Call(boot.ContainerExecuteAsync, args, &pid); err != nil {
+		return 0, fmt.Errorf("executing command %q in sandbox: %v", args, err)
+	}
+	return pid, nil
+}
+
+// Event fetches sandbox stats (e.g. memory and CPU usage), labelled with cid.
+func (s *Sandbox) Event(cid string) (*boot.Event, error) {
+	log.Debugf("Getting events for container %q in sandbox %q", cid, s.ID)
+	ctrl, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return nil, connErr
+	}
+	defer ctrl.Close()
+
+	ev := new(boot.Event)
+	// TODO(b/129292330): Pass in the container id (cid) here. The sandbox
+	// should return events only for that container.
+	if callErr := ctrl.Call(boot.ContainerEvent, nil, ev); callErr != nil {
+		return nil, fmt.Errorf("retrieving event data from sandbox: %v", callErr)
+	}
+	ev.ID = cid
+	return ev, nil
+}
+
+func (s *Sandbox) sandboxConnect() (*urpc.Client, error) {
+	log.Debugf("Connecting to sandbox %q", s.ID)
+	ctrl, dialErr := client.ConnectTo(boot.ControlSocketAddr(s.ID))
+	if dialErr != nil {
+		return nil, s.connError(dialErr) // annotate with the sandbox PID
+	}
+	return ctrl, nil
+}
+
+func (s *Sandbox) connError(err error) error { // prefixes err with the sandbox PID
+	return fmt.Errorf("connecting to control server at PID %d: %v", s.Pid, err)
+}
+
+// createSandboxProcess starts the sandbox as a subprocess by running the "boot"
+// command, passing in the bundle dir.
+func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncFile *os.File) error {
+	// nextFD is used to get unused FDs that we can pass to the sandbox.  It
+	// starts at 3 because 0, 1, and 2 are taken by stdin/out/err.
+	nextFD := 3
+
+	binPath := specutils.ExePath
+	cmd := exec.Command(binPath, conf.ToFlags()...)
+	cmd.SysProcAttr = &syscall.SysProcAttr{}
+
+	// Open the log files to pass to the sandbox as FDs.
+	//
+	// These flags must come BEFORE the "boot" command in cmd.Args.
+	if conf.LogFilename != "" {
+		logFile, err := os.OpenFile(conf.LogFilename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
+		if err != nil {
+			return fmt.Errorf("opening log file %q: %v", conf.LogFilename, err)
+		}
+		defer logFile.Close()
+		cmd.ExtraFiles = append(cmd.ExtraFiles, logFile)
+		cmd.Args = append(cmd.Args, "--log-fd="+strconv.Itoa(nextFD))
+		nextFD++
+	}
+	if conf.DebugLog != "" {
+		test := ""
+		if len(conf.TestOnlyTestNameEnv) != 0 {
+			// Fetch test name if one is provided and the test only flag was set.
+			if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
+				test = t
+			}
+		}
+
+		debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "boot", test)
+		if err != nil {
+			return fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err)
+		}
+		defer debugLogFile.Close()
+		cmd.ExtraFiles = append(cmd.ExtraFiles, debugLogFile)
+		cmd.Args = append(cmd.Args, "--debug-log-fd="+strconv.Itoa(nextFD))
+		nextFD++
+	}
+	if conf.PanicLog != "" {
+		test := ""
+		if len(conf.TestOnlyTestNameEnv) != 0 {
+			// Fetch test name if one is provided and the test only flag was set.
+			if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
+				test = t
+			}
+		}
+
+		panicLogFile, err := specutils.DebugLogFile(conf.PanicLog, "panic", test)
+		if err != nil {
+			return fmt.Errorf("opening debug log file in %q: %v", conf.PanicLog, err)
+		}
+		defer panicLogFile.Close()
+		cmd.ExtraFiles = append(cmd.ExtraFiles, panicLogFile)
+		cmd.Args = append(cmd.Args, "--panic-log-fd="+strconv.Itoa(nextFD))
+		nextFD++
+	}
+
+	// Add the "boot" command to the args.
+	//
+	// All flags after this must be for the boot command
+	cmd.Args = append(cmd.Args, "boot", "--bundle="+args.BundleDir)
+
+	// Create a socket for the control server and donate it to the sandbox.
+	addr := boot.ControlSocketAddr(s.ID)
+	sockFD, err := server.CreateSocket(addr)
+	log.Infof("Creating sandbox process with addr: %s", addr[1:]) // skip "\00".
+	if err != nil {
+		return fmt.Errorf("creating control server socket for sandbox %q: %v", s.ID, err)
+	}
+	controllerFile := os.NewFile(uintptr(sockFD), "control_server_socket")
+	defer controllerFile.Close()
+	cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile)
+	cmd.Args = append(cmd.Args, "--controller-fd="+strconv.Itoa(nextFD))
+	nextFD++
+
+	defer args.MountsFile.Close()
+	cmd.ExtraFiles = append(cmd.ExtraFiles, args.MountsFile)
+	cmd.Args = append(cmd.Args, "--mounts-fd="+strconv.Itoa(nextFD))
+	nextFD++
+
+	specFile, err := specutils.OpenSpec(args.BundleDir)
+	if err != nil {
+		return err
+	}
+	defer specFile.Close()
+	cmd.ExtraFiles = append(cmd.ExtraFiles, specFile)
+	cmd.Args = append(cmd.Args, "--spec-fd="+strconv.Itoa(nextFD))
+	nextFD++
+
+	cmd.ExtraFiles = append(cmd.ExtraFiles, startSyncFile)
+	cmd.Args = append(cmd.Args, "--start-sync-fd="+strconv.Itoa(nextFD))
+	nextFD++
+
+	// If there is a gofer, sends all socket ends to the sandbox.
+	for _, f := range args.IOFiles {
+		defer f.Close()
+		cmd.ExtraFiles = append(cmd.ExtraFiles, f)
+		cmd.Args = append(cmd.Args, "--io-fds="+strconv.Itoa(nextFD))
+		nextFD++
+	}
+
+	gPlatform, err := platform.Lookup(conf.Platform)
+	if err != nil {
+		return err
+	}
+
+	if deviceFile, err := gPlatform.OpenDevice(); err != nil {
+		return fmt.Errorf("opening device file for platform %q: %v", gPlatform, err)
+	} else if deviceFile != nil {
+		defer deviceFile.Close()
+		cmd.ExtraFiles = append(cmd.ExtraFiles, deviceFile)
+		cmd.Args = append(cmd.Args, "--device-fd="+strconv.Itoa(nextFD))
+		nextFD++
+	}
+
+	// TODO(b/151157106): syscall tests fail by timeout if asyncpreemptoff
+	// isn't set.
+	if conf.Platform == "kvm" {
+		cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1")
+	}
+
+	// The current process' stdio must be passed to the application via the
+	// --stdio-fds flag. The stdio of the sandbox process itself must not
+	// be connected to the same FDs, otherwise we risk leaking sandbox
+	// errors to the application, so we set the sandbox stdio to nil,
+	// causing them to read/write from the null device.
+	cmd.Stdin = nil
+	cmd.Stdout = nil
+	cmd.Stderr = nil
+
+	// If the console control socket file is provided, then create a new
+	// pty master/slave pair and set the TTY on the sandbox process.
+	if args.ConsoleSocket != "" {
+		cmd.Args = append(cmd.Args, "--console=true")
+
+		// console.NewWithSocket will send the master on the given
+		// socket, and return the slave.
+		tty, err := console.NewWithSocket(args.ConsoleSocket)
+		if err != nil {
+			return fmt.Errorf("setting up console with socket %q: %v", args.ConsoleSocket, err)
+		}
+		defer tty.Close()
+
+		// Set the TTY as a controlling TTY on the sandbox process.
+		cmd.SysProcAttr.Setctty = true
+		// The Ctty FD must be the FD in the child process's FD table,
+		// which will be nextFD in this case.
+		// See https://github.com/golang/go/issues/29458.
+		cmd.SysProcAttr.Ctty = nextFD
+
+		// Pass the tty as all stdio fds to sandbox.
+		for i := 0; i < 3; i++ {
+			cmd.ExtraFiles = append(cmd.ExtraFiles, tty)
+			cmd.Args = append(cmd.Args, "--stdio-fds="+strconv.Itoa(nextFD))
+			nextFD++
+		}
+
+		if conf.Debug {
+			// If debugging, send the boot process stdio to the
+			// TTY, so that it is easier to find.
+			cmd.Stdin = tty
+			cmd.Stdout = tty
+			cmd.Stderr = tty
+		}
+	} else {
+		// If not using a console, pass our current stdio as the
+		// container stdio via flags.
+		for _, f := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
+			cmd.ExtraFiles = append(cmd.ExtraFiles, f)
+			cmd.Args = append(cmd.Args, "--stdio-fds="+strconv.Itoa(nextFD))
+			nextFD++
+		}
+
+		if conf.Debug {
+			// If debugging, send the boot process stdio to the
+			// this process' stdio, so that is is easier to find.
+			cmd.Stdin = os.Stdin
+			cmd.Stdout = os.Stdout
+			cmd.Stderr = os.Stderr
+		}
+	}
+
+	// Detach from this session, otherwise cmd will get SIGHUP and SIGCONT
+	// when re-parented.
+	cmd.SysProcAttr.Setsid = true
+
+	// nss is the set of namespaces to join or create before starting the sandbox
+	// process. Mount, IPC and UTS namespaces from the host are not used as they
+	// are virtualized inside the sandbox. Be paranoid and run inside an empty
+	// namespace for these. Don't unshare cgroup because sandbox is added to a
+	// cgroup in the caller's namespace.
+	log.Infof("Sandbox will be started in new mount, IPC and UTS namespaces")
+	nss := []specs.LinuxNamespace{
+		{Type: specs.IPCNamespace},
+		{Type: specs.MountNamespace},
+		{Type: specs.UTSNamespace},
+	}
+
+	if gPlatform.Requirements().RequiresCurrentPIDNS {
+		// TODO(b/75837838): Also set a new PID namespace so that we limit
+		// access to other host processes.
+		log.Infof("Sandbox will be started in the current PID namespace")
+	} else {
+		log.Infof("Sandbox will be started in a new PID namespace")
+		nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace})
+		cmd.Args = append(cmd.Args, "--pidns=true")
+	}
+
+	// Joins the network namespace if network is enabled. the sandbox talks
+	// directly to the host network, which may have been configured in the
+	// namespace.
+	if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != boot.NetworkNone {
+		log.Infof("Sandbox will be started in the container's network namespace: %+v", ns)
+		nss = append(nss, ns)
+	} else if conf.Network == boot.NetworkHost {
+		log.Infof("Sandbox will be started in the host network namespace")
+	} else {
+		log.Infof("Sandbox will be started in new network namespace")
+		nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace})
+	}
+
+	// User namespace depends on the network type. Host network requires to run
+	// inside the user namespace specified in the spec or the current namespace
+	// if none is configured.
+	if conf.Network == boot.NetworkHost {
+		if userns, ok := specutils.GetNS(specs.UserNamespace, args.Spec); ok {
+			log.Infof("Sandbox will be started in container's user namespace: %+v", userns)
+			nss = append(nss, userns)
+			specutils.SetUIDGIDMappings(cmd, args.Spec)
+		} else {
+			log.Infof("Sandbox will be started in the current user namespace")
+		}
+		// When running in the caller's defined user namespace, apply the same
+		// capabilities to the sandbox process to ensure it abides to the same
+		// rules.
+		cmd.Args = append(cmd.Args, "--apply-caps=true")
+
+		// If we have CAP_SYS_ADMIN, we can create an empty chroot and
+		// bind-mount the executable inside it.
+		if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+			log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!")
+
+		} else if specutils.HasCapabilities(capability.CAP_SYS_ADMIN) {
+			log.Infof("Sandbox will be started in minimal chroot")
+			cmd.Args = append(cmd.Args, "--setup-root")
+		} else {
+			return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN")
+		}
+	} else {
+		// If we have CAP_SETUID and CAP_SETGID, then we can also run
+		// as user nobody.
+		if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+			log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid())
+			log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!")
+		} else if specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) {
+			log.Infof("Sandbox will be started in new user namespace")
+			nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace})
+			cmd.Args = append(cmd.Args, "--setup-root")
+
+			const nobody = 65534
+			if conf.Rootless {
+				log.Infof("Rootless mode: sandbox will run as nobody inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid())
+				cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
+					{
+						ContainerID: nobody,
+						HostID:      os.Getuid(),
+						Size:        1,
+					},
+				}
+				cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
+					{
+						ContainerID: nobody,
+						HostID:      os.Getgid(),
+						Size:        1,
+					},
+				}
+
+			} else {
+				// Map nobody in the new namespace to nobody in the parent namespace.
+				//
+				// A sandbox process will construct an empty
+				// root for itself, so it has to have
+				// CAP_SYS_ADMIN and CAP_SYS_CHROOT capabilities.
+				cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
+					{
+						ContainerID: nobody,
+						HostID:      nobody,
+						Size:        1,
+					},
+				}
+				cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
+					{
+						ContainerID: nobody,
+						HostID:      nobody,
+						Size:        1,
+					},
+				}
+			}
+
+			// Set credentials to run as user and group nobody.
+			cmd.SysProcAttr.Credential = &syscall.Credential{Uid: nobody, Gid: nobody}
+			cmd.SysProcAttr.AmbientCaps = append(cmd.SysProcAttr.AmbientCaps, uintptr(capability.CAP_SYS_ADMIN), uintptr(capability.CAP_SYS_CHROOT))
+		} else {
+			return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID")
+		}
+	}
+
+	cmd.Args[0] = "runsc-sandbox"
+
+	if s.Cgroup != nil {
+		cpuNum, err := s.Cgroup.NumCPU()
+		if err != nil {
+			return fmt.Errorf("getting cpu count from cgroups: %v", err)
+		}
+		if conf.CPUNumFromQuota {
+			// Dropping below 2 CPUs can trigger application to disable
+			// locks that can lead do hard to debug errors, so just
+			// leaving two cores as reasonable default.
+			const minCPUs = 2
+
+			quota, err := s.Cgroup.CPUQuota()
+			if err != nil {
+				return fmt.Errorf("getting cpu qouta from cgroups: %v", err)
+			}
+			if n := int(math.Ceil(quota)); n > 0 {
+				if n < minCPUs {
+					n = minCPUs
+				}
+				if n < cpuNum {
+					// Only lower the cpu number.
+					cpuNum = n
+				}
+			}
+		}
+		cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum))
+
+		mem, err := s.Cgroup.MemoryLimit()
+		if err != nil {
+			return fmt.Errorf("getting memory limit from cgroups: %v", err)
+		}
+		// When memory limit is unset, a "large" number is returned. In that case,
+		// just stick with the default.
+		if mem < 0x7ffffffffffff000 {
+			cmd.Args = append(cmd.Args, "--total-memory", strconv.FormatUint(mem, 10))
+		}
+	}
+
+	if args.UserLog != "" {
+		f, err := os.OpenFile(args.UserLog, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664)
+		if err != nil {
+			return fmt.Errorf("opening compat log file: %v", err)
+		}
+		defer f.Close()
+
+		cmd.ExtraFiles = append(cmd.ExtraFiles, f)
+		cmd.Args = append(cmd.Args, "--user-log-fd", strconv.Itoa(nextFD))
+		nextFD++
+	}
+
+	if args.Attached {
+		// Kill sandbox if parent process exits in attached mode.
+		cmd.SysProcAttr.Pdeathsig = syscall.SIGKILL
+		// Tells boot that any process it creates must have pdeathsig set.
+		cmd.Args = append(cmd.Args, "--attached")
+	}
+
+	// Add container as the last argument.
+	cmd.Args = append(cmd.Args, s.ID)
+
+	// Log the FDs we are donating to the sandbox process.
+	for i, f := range cmd.ExtraFiles {
+		log.Debugf("Donating FD %d: %q", i+3, f.Name())
+	}
+
+	log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args)
+	log.Debugf("SysProcAttr: %+v", cmd.SysProcAttr)
+	if err := specutils.StartInNS(cmd, nss); err != nil {
+		err := fmt.Errorf("starting sandbox: %v", err)
+		// If the sandbox failed to start, it may be because the binary
+		// permissions were incorrect. Check the bits and return a more helpful
+		// error message.
+		//
+		// NOTE: The error message is checked because error types are lost over
+		// rpc calls.
+		if strings.Contains(err.Error(), syscall.EACCES.Error()) {
+			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
+				return fmt.Errorf("%v: %v", err, permsErr)
+			}
+		}
+		return err
+	}
+	s.child = true
+	s.Pid = cmd.Process.Pid
+	log.Infof("Sandbox started, PID: %d", s.Pid)
+
+	return nil
+}
+
+// Wait blocks until container cid exits and reports its WaitStatus.
+func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) {
+	log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID)
+	var status syscall.WaitStatus
+
+	conn, connErr := s.sandboxConnect()
+	if connErr == nil {
+		defer conn.Close()
+		// Preferred path: ask the sandbox for the container status via RPC.
+		rpcErr := conn.Call(boot.ContainerWait, &cid, &status)
+		if rpcErr == nil {
+			return status, nil
+		}
+		// The sandbox may have gone away after the connection was made,
+		// either before the Wait RPC was issued or while it was running.
+		log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, rpcErr)
+	} else {
+		// The sandbox may have exited before we had a chance to wait on it.
+		log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, connErr)
+	}
+
+	// Fallback: the sandbox already exited or died during the Wait RPC.
+	// Ask Linux for the sandbox exit status instead, which in most cases
+	// matches the container exit status. That status is only available
+	// when the sandbox is a child of this process.
+	stopErr := s.waitForStopped()
+	if stopErr != nil {
+		return status, stopErr
+	}
+	if s.child {
+		return s.status, nil
+	}
+	return status, fmt.Errorf("sandbox no longer running and its exit status is unavailable")
+}
+
+// WaitPID waits for process pid of container cid inside the sandbox and
+// reports that process's WaitStatus.
+func (s *Sandbox) WaitPID(cid string, pid int32) (syscall.WaitStatus, error) {
+	log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID)
+	var status syscall.WaitStatus
+
+	client, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return status, connErr
+	}
+	defer client.Close()
+
+	// The request carries the target PID and the owning container ID.
+	request := boot.WaitPIDArgs{PID: pid, CID: cid}
+	callErr := client.Call(boot.ContainerWaitPID, &request, &status)
+	if callErr != nil {
+		return status, fmt.Errorf("waiting on PID %d in sandbox %q: %v", pid, s.ID, callErr)
+	}
+	return status, nil
+}
+
+// IsRootContainer reports whether cid is the root container, i.e. whether it
+// has the same ID as the sandbox itself.
+func (s *Sandbox) IsRootContainer(cid string) bool {
+	return cid == s.ID
+}
+
+// destroy kills the sandbox process, if one is recorded, and waits for it to
+// stop. It fails fast and is idempotent.
+func (s *Sandbox) destroy() error {
+	log.Debugf("Destroy sandbox %q", s.ID)
+	if s.Pid != 0 {
+		log.Debugf("Killing sandbox %q", s.ID)
+		// ESRCH is tolerated: it means the process is already gone.
+		if err := syscall.Kill(s.Pid, syscall.SIGKILL); err != nil && err != syscall.ESRCH {
+			return fmt.Errorf("killing sandbox %q PID %d: %v", s.ID, s.Pid, err)
+		}
+		if err := s.waitForStopped(); err != nil {
+			return fmt.Errorf("waiting sandbox %q stop: %v", s.ID, err)
+		}
+	}
+	return nil
+}
+
+// SignalContainer sends sig to container cid in the sandbox, using the
+// all-processes delivery mode when all is true. If all is true and the signal
+// is SIGKILL, then it waits for all processes to exit before returning.
+func (s *Sandbox) SignalContainer(cid string, sig syscall.Signal, all bool) error {
+	log.Debugf("Signal sandbox %q", s.ID)
+	client, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return connErr
+	}
+	defer client.Close()
+
+	// Single-process delivery is the default; all switches the mode.
+	request := boot.SignalArgs{
+		CID:   cid,
+		Signo: int32(sig),
+		Mode:  boot.DeliverToProcess,
+	}
+	if all {
+		request.Mode = boot.DeliverToAllProcesses
+	}
+	callErr := client.Call(boot.ContainerSignal, &request, nil)
+	if callErr != nil {
+		return fmt.Errorf("signaling container %q: %v", cid, callErr)
+	}
+	return nil
+}
+
+// SignalProcess sends sig to process pid of container cid. If fgProcess is
+// true, then the signal is sent to the foreground process group in the same
+// session that PID belongs to. This is only valid if the process is attached
+// to a host TTY.
+func (s *Sandbox) SignalProcess(cid string, pid int32, sig syscall.Signal, fgProcess bool) error {
+	log.Debugf("Signal sandbox %q", s.ID)
+	client, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return connErr
+	}
+	defer client.Close()
+
+	// Single-process delivery is the default; fgProcess switches the mode.
+	request := boot.SignalArgs{
+		CID:   cid,
+		Signo: int32(sig),
+		PID:   pid,
+		Mode:  boot.DeliverToProcess,
+	}
+	if fgProcess {
+		request.Mode = boot.DeliverToForegroundProcessGroup
+	}
+	callErr := client.Call(boot.ContainerSignal, &request, nil)
+	if callErr != nil {
+		return fmt.Errorf("signaling container %q PID %d: %v", cid, pid, callErr)
+	}
+	return nil
+}
+
+// Checkpoint sends the checkpoint call for a container in the sandbox.
+// The statefile will be written to f.
+func (s *Sandbox) Checkpoint(cid string, f *os.File) error {
+	log.Debugf("Checkpoint sandbox %q", s.ID)
+	client, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return connErr
+	}
+	defer client.Close()
+
+	// f is the only file in the request payload; cid is used for errors only.
+	payload := urpc.FilePayload{Files: []*os.File{f}}
+	saveOpts := control.SaveOpts{FilePayload: payload}
+
+	callErr := client.Call(boot.ContainerCheckpoint, &saveOpts, nil)
+	if callErr != nil {
+		return fmt.Errorf("checkpointing container %q: %v", cid, callErr)
+	}
+
+	return nil
+}
+
+// Pause sends the pause call for a container in the sandbox.
+func (s *Sandbox) Pause(cid string) error {
+	log.Debugf("Pause sandbox %q", s.ID)
+	client, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return connErr
+	}
+	defer client.Close()
+	// The RPC takes no argument; cid only appears in the error message.
+	if callErr := client.Call(boot.ContainerPause, nil, nil); callErr != nil {
+		return fmt.Errorf("pausing container %q: %v", cid, callErr)
+	}
+	return nil
+}
+
+// Resume sends the resume call for a container in the sandbox.
+func (s *Sandbox) Resume(cid string) error {
+	log.Debugf("Resume sandbox %q", s.ID)
+	client, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return connErr
+	}
+	defer client.Close()
+	// The RPC takes no argument; cid only appears in the error message.
+	if callErr := client.Call(boot.ContainerResume, nil, nil); callErr != nil {
+		return fmt.Errorf("resuming container %q: %v", cid, callErr)
+	}
+	return nil
+}
+
+// IsRunning reports whether the process recorded in s.Pid can be signaled.
+func (s *Sandbox) IsRunning() bool {
+	if s.Pid == 0 {
+		// No sandbox process is recorded.
+		return false
+	}
+	// Signal 0 delivers nothing: kill(2) only performs its checks, so a nil
+	// error means the process exists and we are allowed to signal it.
+	err := syscall.Kill(s.Pid, 0)
+	return err == nil
+}
+
+// Stacks asks the sandbox for all of its stacks and returns them as a string.
+func (s *Sandbox) Stacks() (string, error) {
+	log.Debugf("Stacks sandbox %q", s.ID)
+	client, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return "", connErr
+	}
+	defer client.Close()
+
+	var dump string
+	if callErr := client.Call(boot.SandboxStacks, nil, &dump); callErr != nil {
+		return "", fmt.Errorf("getting sandbox %q stacks: %v", s.ID, callErr)
+	}
+	return dump, nil
+}
+
+// HeapProfile asks the sandbox to write a heap profile to f.
+func (s *Sandbox) HeapProfile(f *os.File) error {
+	log.Debugf("Heap profile %q", s.ID)
+	client, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return connErr
+	}
+	defer client.Close()
+
+	// f travels to the sandbox inside the request's file payload.
+	payload := urpc.FilePayload{Files: []*os.File{f}}
+	request := control.ProfileOpts{FilePayload: payload}
+	callErr := client.Call(boot.HeapProfile, &request, nil)
+	if callErr != nil {
+		return fmt.Errorf("getting sandbox %q heap profile: %v", s.ID, callErr)
+	}
+
+	return nil
+}
+
+// StartCPUProfile starts a CPU profile in the sandbox that is written to f.
+func (s *Sandbox) StartCPUProfile(f *os.File) error {
+	log.Debugf("CPU profile start %q", s.ID)
+	client, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return connErr
+	}
+	defer client.Close()
+
+	// f travels to the sandbox inside the request's file payload.
+	payload := urpc.FilePayload{Files: []*os.File{f}}
+	request := control.ProfileOpts{FilePayload: payload}
+	callErr := client.Call(boot.StartCPUProfile, &request, nil)
+	if callErr != nil {
+		return fmt.Errorf("starting sandbox %q CPU profile: %v", s.ID, callErr)
+	}
+
+	return nil
+}
+
+// StopCPUProfile stops a previously started CPU profile.
+func (s *Sandbox) StopCPUProfile() error {
+	log.Debugf("CPU profile stop %q", s.ID)
+	client, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return connErr
+	}
+	defer client.Close()
+	// The stop request carries no argument and expects no result.
+	if callErr := client.Call(boot.StopCPUProfile, nil, nil); callErr != nil {
+		return fmt.Errorf("stopping sandbox %q CPU profile: %v", s.ID, callErr)
+	}
+	return nil
+}
+
+// GoroutineProfile asks the sandbox to write a goroutine profile to f.
+func (s *Sandbox) GoroutineProfile(f *os.File) error {
+	log.Debugf("Goroutine profile %q", s.ID)
+	client, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return connErr
+	}
+	defer client.Close()
+
+	// f travels to the sandbox inside the request's file payload.
+	payload := urpc.FilePayload{Files: []*os.File{f}}
+	request := control.ProfileOpts{FilePayload: payload}
+	callErr := client.Call(boot.GoroutineProfile, &request, nil)
+	if callErr != nil {
+		return fmt.Errorf("getting sandbox %q goroutine profile: %v", s.ID, callErr)
+	}
+
+	return nil
+}
+
+// BlockProfile asks the sandbox to write a block profile to f.
+func (s *Sandbox) BlockProfile(f *os.File) error {
+	log.Debugf("Block profile %q", s.ID)
+	client, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return connErr
+	}
+	defer client.Close()
+
+	// f travels to the sandbox inside the request's file payload.
+	payload := urpc.FilePayload{Files: []*os.File{f}}
+	request := control.ProfileOpts{FilePayload: payload}
+	callErr := client.Call(boot.BlockProfile, &request, nil)
+	if callErr != nil {
+		return fmt.Errorf("getting sandbox %q block profile: %v", s.ID, callErr)
+	}
+
+	return nil
+}
+
+// MutexProfile asks the sandbox to write a mutex profile to f.
+func (s *Sandbox) MutexProfile(f *os.File) error {
+	log.Debugf("Mutex profile %q", s.ID)
+	client, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return connErr
+	}
+	defer client.Close()
+
+	// f travels to the sandbox inside the request's file payload.
+	payload := urpc.FilePayload{Files: []*os.File{f}}
+	request := control.ProfileOpts{FilePayload: payload}
+	callErr := client.Call(boot.MutexProfile, &request, nil)
+	if callErr != nil {
+		return fmt.Errorf("getting sandbox %q mutex profile: %v", s.ID, callErr)
+	}
+
+	return nil
+}
+
+// StartTrace starts a trace in the sandbox that is written to f.
+func (s *Sandbox) StartTrace(f *os.File) error {
+	log.Debugf("Trace start %q", s.ID)
+	client, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return connErr
+	}
+	defer client.Close()
+
+	// f travels to the sandbox inside the request's file payload.
+	payload := urpc.FilePayload{Files: []*os.File{f}}
+	request := control.ProfileOpts{FilePayload: payload}
+	callErr := client.Call(boot.StartTrace, &request, nil)
+	if callErr != nil {
+		return fmt.Errorf("starting sandbox %q trace: %v", s.ID, callErr)
+	}
+
+	return nil
+}
+
+// StopTrace stops a previously started trace.
+func (s *Sandbox) StopTrace() error {
+	log.Debugf("Trace stop %q", s.ID)
+	client, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return connErr
+	}
+	defer client.Close()
+	// The stop request carries no argument and expects no result.
+	if callErr := client.Call(boot.StopTrace, nil, nil); callErr != nil {
+		return fmt.Errorf("stopping sandbox %q trace: %v", s.ID, callErr)
+	}
+	return nil
+}
+
+// ChangeLogging changes logging options.
+func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error {
+	log.Debugf("Change logging start %q", s.ID)
+	client, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return connErr
+	}
+	defer client.Close()
+	// args is forwarded to the sandbox as the RPC argument, unmodified.
+	if callErr := client.Call(boot.ChangeLogging, &args, nil); callErr != nil {
+		return fmt.Errorf("changing sandbox %q logging: %v", s.ID, callErr)
+	}
+	return nil
+}
+
+// DestroyContainer destroys the given container. If it is the root container,
+// then the entire sandbox is destroyed. Errors are dropped when the sandbox is
+// no longer running.
+func (s *Sandbox) DestroyContainer(cid string) error {
+	err := s.destroyContainer(cid)
+	// Once the sandbox is gone the container has already been destroyed,
+	// so a failure is only reported while the sandbox is still running.
+	if err != nil && s.IsRunning() {
+		return err
+	}
+	return nil
+}
+
+// destroyContainer destroys the sandbox for the root container, else cid via RPC.
+func (s *Sandbox) destroyContainer(cid string) error {
+	if s.IsRootContainer(cid) {
+		log.Debugf("Destroying root container %q by destroying sandbox", cid)
+		return s.destroy()
+	}
+	log.Debugf("Destroying container %q in sandbox %q", cid, s.ID)
+	client, connErr := s.sandboxConnect()
+	if connErr != nil {
+		return connErr
+	}
+	defer client.Close()
+	if callErr := client.Call(boot.ContainerDestroy, &cid, nil); callErr != nil {
+		return fmt.Errorf("destroying container %q: %v", cid, callErr)
+	}
+	return nil
+}
+
+// waitForStopped retries every 100ms, for at most 5s, until the sandbox has stopped.
+func (s *Sandbox) waitForStopped() error {
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	policy := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
+	return backoff.Retry(func() error {
+		if s.child {
+			s.statusMu.Lock()
+			defer s.statusMu.Unlock()
+			if s.Pid == 0 {
+				return nil
+			}
+			// The sandbox is our child, so it can be reaped here. WNOHANG
+			// keeps the call from blocking while it is still alive.
+			reaped, waitErr := syscall.Wait4(int(s.Pid), &s.status, syscall.WNOHANG, nil)
+			if waitErr != nil {
+				return fmt.Errorf("error waiting the sandbox process: %v", waitErr)
+			}
+			if reaped == 0 {
+				return fmt.Errorf("sandbox is still running")
+			}
+			s.Pid = 0
+		} else if s.IsRunning() {
+			return fmt.Errorf("sandbox is still running")
+		}
+		return nil
+	}, policy)
+}
+
+// deviceFileForPlatform opens the device file for the given platform. If the
+// platform does not need a device file, then nil is returned.
+func deviceFileForPlatform(name string) (*os.File, error) {
+	plat, lookupErr := platform.Lookup(name)
+	if lookupErr != nil {
+		return nil, lookupErr
+	}
+	device, openErr := plat.OpenDevice()
+	if openErr != nil {
+		return nil, fmt.Errorf("opening device file for platform %q: %v", plat, openErr)
+	}
+	// device is passed through as-is, including a nil file.
+	return device, nil
+}
+
+// checkBinaryPermissions verifies that the required permission bits are set
+// on the running executable; it returns nil when they are.
+func checkBinaryPermissions(conf *boot.Config) error {
+	// All platforms need the "other" execute bit.
+	neededBits := os.FileMode(0001)
+	if conf.Platform == platforms.Ptrace {
+		// Ptrace additionally needs the "other" read bit.
+		neededBits |= os.FileMode(0004)
+	}
+
+	exePath, err := os.Executable()
+	if err != nil {
+		return fmt.Errorf("getting exe path: %v", err)
+	}
+
+	// Check the permissions of the runsc binary and return an error if
+	// they don't match expectations.
+	info, err := os.Stat(exePath)
+	if err != nil {
+		return fmt.Errorf("stat file: %v", err)
+	}
+	// The message embeds exePath, so it must never be the format string.
+	if info.Mode().Perm()&neededBits != neededBits {
+		return fmt.Errorf("%s", specutils.FaqErrorMsg("runsc-perms", fmt.Sprintf("%s does not have the correct permissions", exePath)))
+	}
+	return nil
+}
diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD
new file mode 100644
index 000000000..4ccd77f63
--- /dev/null
+++ b/runsc/specutils/BUILD
@@ -0,0 +1,32 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "specutils",
+    srcs = [
+        "cri.go",
+        "fs.go",
+        "namespace.go",
+        "specutils.go",
+    ],
+    visibility = ["//:sandbox"],  # NOTE(review): presumably a package_group in the root BUILD; confirm.
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/bits",
+        "//pkg/log",
+        "//pkg/sentry/kernel/auth",
+        "@com_github_cenkalti_backoff//:go_default_library",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@com_github_syndtr_gocapability//capability:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+go_test(
+    name = "specutils_test",
+    size = "small",
+    srcs = ["specutils_test.go"],
+    library = ":specutils",  # The go_library target defined above.
+    deps = ["@com_github_opencontainers_runtime-spec//specs-go:go_default_library"],
+)
diff --git a/runsc/specutils/cri.go b/runsc/specutils/cri.go
new file mode 100644
index 000000000..9c5877cd5
--- /dev/null
+++ b/runsc/specutils/cri.go
@@ -0,0 +1,110 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package specutils
+
+import (
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+const (
+	// ContainerdContainerTypeAnnotation is the OCI annotation set by
+	// containerd to indicate whether the container to create should have
+	// its own sandbox or a container within an existing sandbox.
+	ContainerdContainerTypeAnnotation = "io.kubernetes.cri.container-type"
+	// ContainerdContainerTypeContainer is the containerd annotation value
+	// indicating the container should be created in an existing sandbox.
+	ContainerdContainerTypeContainer = "container"
+	// ContainerdContainerTypeSandbox is the containerd annotation value
+	// indicating the container should be created in a new sandbox.
+	ContainerdContainerTypeSandbox = "sandbox"
+
+	// ContainerdSandboxIDAnnotation is the OCI annotation set to indicate
+	// which sandbox the container should be created in when the container
+	// is not the first container in the sandbox.
+	ContainerdSandboxIDAnnotation = "io.kubernetes.cri.sandbox-id"
+
+	// CRIOContainerTypeAnnotation is the OCI annotation set by
+	// CRI-O to indicate whether the container to create should have
+	// its own sandbox or a container within an existing sandbox.
+	CRIOContainerTypeAnnotation = "io.kubernetes.cri-o.ContainerType"
+
+	// CRIOContainerTypeContainer is the CRI-O annotation value
+	// indicating the container should be created in an existing sandbox.
+	CRIOContainerTypeContainer = "container"
+	// CRIOContainerTypeSandbox is the CRI-O annotation value
+	// indicating the container should be created in a new sandbox.
+	CRIOContainerTypeSandbox = "sandbox"
+
+	// CRIOSandboxIDAnnotation is the OCI annotation set to indicate
+	// which sandbox the container should be created in when the container
+	// is not the first container in the sandbox.
+	CRIOSandboxIDAnnotation = "io.kubernetes.cri-o.SandboxID"
+)
+
+// ContainerType represents the type of container requested by the calling container manager.
+type ContainerType int
+
+const (
+	// ContainerTypeUnspecified (the zero value) indicates that no known
+	// container type annotation was found in the spec.
+	ContainerTypeUnspecified ContainerType = iota
+	// ContainerTypeUnknown indicates that a container type annotation was
+	// found, but its value is not one we recognize.
+	ContainerTypeUnknown
+	// ContainerTypeSandbox indicates that the container should be run in a
+	// new sandbox.
+	ContainerTypeSandbox
+	// ContainerTypeContainer indicates that the container should be run in
+	// an existing sandbox.
+	ContainerTypeContainer
+)
+
+// SpecContainerType tries to determine the type of container specified by the
+// container manager using well-known container annotations.
+func SpecContainerType(spec *specs.Spec) ContainerType {
+	// The containerd annotation is consulted first; the CRI-O annotation is
+	// only considered when the containerd one is absent.
+	kind, found := spec.Annotations[ContainerdContainerTypeAnnotation]
+	sandboxKind, containerKind := ContainerdContainerTypeSandbox, ContainerdContainerTypeContainer
+	if !found {
+		kind, found = spec.Annotations[CRIOContainerTypeAnnotation]
+		sandboxKind, containerKind = CRIOContainerTypeSandbox, CRIOContainerTypeContainer
+	}
+	if !found {
+		return ContainerTypeUnspecified
+	}
+
+	switch kind {
+	case sandboxKind:
+		return ContainerTypeSandbox
+	case containerKind:
+		return ContainerTypeContainer
+	default:
+		// An annotation is present but holds a value we do not recognize.
+		return ContainerTypeUnknown
+	}
+}
+
+// SandboxID returns the ID of the sandbox to join and whether an ID was found
+// in the spec.
+func SandboxID(spec *specs.Spec) (string, bool) {
+	// Probe the containerd annotation first, then the CRI-O one.
+	for _, key := range []string{ContainerdSandboxIDAnnotation, CRIOSandboxIDAnnotation} {
+		if id, ok := spec.Annotations[key]; ok {
+			return id, true
+		}
+	}
+	return "", false
+}
diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go
new file mode 100644
index 000000000..138aa4dd1
--- /dev/null
+++ b/runsc/specutils/fs.go
@@ -0,0 +1,155 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package specutils
+
+import (
+	"fmt"
+	"math/bits"
+	"path"
+	"syscall"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+type mapping struct {
+	set bool   // whether the option enables val (true) or negates it (false)
+	val uint32 // mount(2) flag bit(s) the option refers to
+}
+
+// optionsMap maps OCI filesystem mount options, other than the propagation
+// options kept in propOptionsMap, to mount(2) syscall flags.
+var optionsMap = map[string]mapping{
+	"acl":           {set: true, val: syscall.MS_POSIXACL},
+	"async":         {set: false, val: syscall.MS_SYNCHRONOUS},
+	"atime":         {set: false, val: syscall.MS_NOATIME},
+	"bind":          {set: true, val: syscall.MS_BIND},
+	"defaults":      {set: true, val: 0},
+	"dev":           {set: false, val: syscall.MS_NODEV},
+	"diratime":      {set: false, val: syscall.MS_NODIRATIME},
+	"dirsync":       {set: true, val: syscall.MS_DIRSYNC},
+	"exec":          {set: false, val: syscall.MS_NOEXEC},
+	"noexec":        {set: true, val: syscall.MS_NOEXEC},
+	"iversion":      {set: true, val: syscall.MS_I_VERSION},
+	"loud":          {set: false, val: syscall.MS_SILENT},
+	"mand":          {set: true, val: syscall.MS_MANDLOCK},
+	"noacl":         {set: false, val: syscall.MS_POSIXACL},
+	"noatime":       {set: true, val: syscall.MS_NOATIME},
+	"nodev":         {set: true, val: syscall.MS_NODEV},
+	"nodiratime":    {set: true, val: syscall.MS_NODIRATIME},
+	"noiversion":    {set: false, val: syscall.MS_I_VERSION},
+	"nomand":        {set: false, val: syscall.MS_MANDLOCK},
+	"norelatime":    {set: false, val: syscall.MS_RELATIME},
+	"nostrictatime": {set: false, val: syscall.MS_STRICTATIME},
+	"nosuid":        {set: true, val: syscall.MS_NOSUID},
+	"rbind":         {set: true, val: syscall.MS_BIND | syscall.MS_REC},
+	"relatime":      {set: true, val: syscall.MS_RELATIME},
+	"remount":       {set: true, val: syscall.MS_REMOUNT},
+	"ro":            {set: true, val: syscall.MS_RDONLY},
+	"rw":            {set: false, val: syscall.MS_RDONLY},
+	"silent":        {set: true, val: syscall.MS_SILENT},
+	"strictatime":   {set: true, val: syscall.MS_STRICTATIME},
+	"suid":          {set: false, val: syscall.MS_NOSUID},
+	"sync":          {set: true, val: syscall.MS_SYNCHRONOUS},
+}
+
+// propOptionsMap is similar to optionsMap, but it lists the mount propagation
+// options, which cannot be used together with other flags.
+var propOptionsMap = map[string]mapping{
+	"private":     {set: true, val: syscall.MS_PRIVATE},
+	"rprivate":    {set: true, val: syscall.MS_PRIVATE | syscall.MS_REC},
+	"slave":       {set: true, val: syscall.MS_SLAVE},
+	"rslave":      {set: true, val: syscall.MS_SLAVE | syscall.MS_REC},
+	"unbindable":  {set: true, val: syscall.MS_UNBINDABLE},
+	"runbindable": {set: true, val: syscall.MS_UNBINDABLE | syscall.MS_REC},
+}
+
+// invalidOptions lists the mount options that are not allowed.
+//   - shared, rshared: the sandbox must be isolated from the host. Propagating
+//     mount changes from the sandbox to the host breaks the isolation.
+var invalidOptions = []string{"shared", "rshared"}
+
+// OptionsToFlags converts the mount options known to optionsMap to syscall flags.
+func OptionsToFlags(opts []string) uint32 {
+	return optionsToFlags(opts, optionsMap)
+}
+
+// PropOptionsToFlags converts propagation mount options to syscall flags.
+// Propagation options cannot be set together with other options and must be
+// handled separately.
+func PropOptionsToFlags(opts []string) uint32 {
+	return optionsToFlags(opts, propOptionsMap)
+}
+
+// optionsToFlags applies opts in order; options absent from source are ignored.
+func optionsToFlags(opts []string, source map[string]mapping) uint32 {
+	var rv uint32
+	for _, opt := range opts {
+		m, ok := source[opt]
+		if ok && m.set {
+			rv |= m.val
+		} else if ok {
+			rv &^= m.val // Clear, never toggle: "rw" alone must not yield MS_RDONLY.
+		}
+	}
+	return rv
+}
+
+// validateMount requires an absolute destination and, for bind mounts, valid options.
+func validateMount(mnt *specs.Mount) error {
+	switch {
+	case !path.IsAbs(mnt.Destination):
+		return fmt.Errorf("Mount.Destination must be an absolute path: %v", mnt)
+	case mnt.Type == "bind":
+		return ValidateMountOptions(mnt.Options)
+	}
+	return nil
+}
+
+// ValidateMountOptions rejects forbidden, unknown and inconsistent mount options.
+func ValidateMountOptions(opts []string) error {
+	for _, opt := range opts {
+		if ContainsStr(invalidOptions, opt) {
+			return fmt.Errorf("mount option %q is not supported", opt)
+		}
+		// An option must be known to at least one of the two tables.
+		_, regular := optionsMap[opt]
+		if _, propagation := propOptionsMap[opt]; !regular && !propagation {
+			return fmt.Errorf("unknown mount option %q", opt)
+		}
+		if propErr := validatePropagation(opt); propErr != nil {
+			return propErr
+		}
+	}
+	return nil
+}
+
+// ValidateRootfsPropagation validates that rootfs propagation options are
+// correct.
+func validateRootfsPropagation(opt string) error {
+	flags := PropOptionsToFlags([]string{opt})
+	if flags&(syscall.MS_SLAVE|syscall.MS_PRIVATE) == 0 {
+		return fmt.Errorf("root mount propagation option must specify private or slave: %q", opt)
+	}
+	return validatePropagation(opt)
+}
+
+func validatePropagation(opt string) error {
+	flags := PropOptionsToFlags([]string{opt})
+	exclusive := flags & (syscall.MS_SLAVE | syscall.MS_PRIVATE | syscall.MS_SHARED | syscall.MS_UNBINDABLE)
+	if bits.OnesCount32(exclusive) > 1 {
+		return fmt.Errorf("mount propagation options are mutually exclusive: %q", opt)
+	}
+	return nil
+}
diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go
new file mode 100644
index 000000000..60bb7b7ee
--- /dev/null
+++ b/runsc/specutils/namespace.go
@@ -0,0 +1,277 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package specutils
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"syscall"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/syndtr/gocapability/capability"
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/log"
+)
+
+// nsCloneFlag translates a namespace type into the clone flag that can be
+// used to set a namespace of that type. It panics when the type is not one
+// of the namespace types handled below.
+func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr {
+	switch nst {
+	case specs.CgroupNamespace:
+		return unix.CLONE_NEWCGROUP
+	case specs.IPCNamespace:
+		return unix.CLONE_NEWIPC
+	case specs.MountNamespace:
+		return unix.CLONE_NEWNS
+	case specs.NetworkNamespace:
+		return unix.CLONE_NEWNET
+	case specs.PIDNamespace:
+		return unix.CLONE_NEWPID
+	case specs.UserNamespace:
+		return unix.CLONE_NEWUSER
+	case specs.UTSNamespace:
+		return unix.CLONE_NEWUTS
+	}
+	panic(fmt.Sprintf("unknown namespace %v", nst))
+}
+
+// nsPath returns the path of the namespace for the current process and the
+// given namespace.
+func nsPath(nst specs.LinuxNamespaceType) string {
+	base := "/proc/self/ns"
+	switch nst {
+	case specs.CgroupNamespace:
+		return filepath.Join(base, "cgroup")
+	case specs.IPCNamespace:
+		return filepath.Join(base, "ipc")
+	case specs.MountNamespace:
+		return filepath.Join(base, "mnt")
+	case specs.NetworkNamespace:
+		return filepath.Join(base, "net")
+	case specs.PIDNamespace:
+		return filepath.Join(base, "pid")
+	case specs.UserNamespace:
+		return filepath.Join(base, "user")
+	case specs.UTSNamespace:
+		return filepath.Join(base, "uts")
+	default:
+		panic(fmt.Sprintf("unknown namespace %v", nst))
+	}
+}
+
+// GetNS looks up the namespace of type nst in the namespaces listed by the
+// spec. When one is found it is returned together with true. When the spec
+// has no Linux section or no namespace of that type, the zero-value
+// namespace and false are returned.
+func GetNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) {
+	if s.Linux != nil {
+		for i := range s.Linux.Namespaces {
+			if candidate := s.Linux.Namespaces[i]; candidate.Type == nst {
+				return candidate, true
+			}
+		}
+	}
+	return specs.LinuxNamespace{}, false
+}
+
+// FilterNS collects the namespaces of the spec whose type appears in the
+// `filter` slice, in the order the types are listed in `filter`.
+func FilterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNamespace {
+	if s.Linux == nil {
+		return nil
+	}
+	var matched []specs.LinuxNamespace
+	for _, want := range filter {
+		if found, ok := GetNS(want, s); ok {
+			matched = append(matched, found)
+		}
+	}
+	return matched
+}
+
+// setNS issues the setns syscall with fd and nsType. The caller must hold
+// the OS thread locked.
+func setNS(fd, nsType uintptr) error {
+	_, _, errno := syscall.RawSyscall(unix.SYS_SETNS, fd, nsType, 0)
+	if errno == 0 {
+		return nil
+	}
+	return errno
+}
+
+// ApplyNS moves the current thread into the namespace at ns.Path and returns
+// a function that switches back to the original one (it panics on failure).
+//
+// Preconditions: Must be called with os thread locked.
+func ApplyNS(ns specs.LinuxNamespace) (func(), error) {
+	log.Infof("Applying namespace %v at path %q", ns.Type, ns.Path)
+	newNS, err := os.Open(ns.Path)
+	if err != nil {
+		return nil, fmt.Errorf("error opening %q: %v", ns.Path, err)
+	}
+	defer newNS.Close()
+
+	// Keep the current namespace open so that it can be restored later.
+	curPath := nsPath(ns.Type)
+	oldNS, err := os.Open(curPath)
+	if err != nil {
+		return nil, fmt.Errorf("error opening %q: %v", curPath, err)
+	}
+
+	// Enter the requested namespace; oldNS is closed here only on failure.
+	flag := nsCloneFlag(ns.Type)
+	if err := setNS(newNS.Fd(), flag); err != nil {
+		oldNS.Close()
+		return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err)
+	}
+	return func() {
+		log.Infof("Restoring namespace %v", ns.Type)
+		defer oldNS.Close()
+		if err := setNS(oldNS.Fd(), flag); err != nil {
+			panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err))
+		}
+	}, nil
+}
+
+// StartInNS starts cmd in the given namespaces. Namespaces without a path are
+// created via clone flags; the others are joined and restored on return.
+func StartInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error {
+	// Joining a namespace affects the calling thread only, so the goroutine
+	// has to stay on this OS thread until the namespaces are restored.
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	if cmd.SysProcAttr == nil {
+		cmd.SysProcAttr = &syscall.SysProcAttr{}
+	}
+
+	for _, ns := range nss {
+		if ns.Path == "" {
+			// No path: request a new namespace through the clone flags.
+			cmd.SysProcAttr.Cloneflags |= nsCloneFlag(ns.Type)
+			continue
+		}
+		// Join the given namespace now. The deferred restore runs when
+		// StartInNS returns, i.e. after cmd.Start below.
+		restoreNS, err := ApplyNS(ns)
+		if err != nil {
+			return err
+		}
+		defer restoreNS()
+	}
+
+	return cmd.Start()
+}
+
+// SetUIDGIDMappings appends the uid/gid mappings of the spec to the
+// SysProcAttr of cmd, allocating SysProcAttr first when cmd has none. It does
+// nothing when the spec has no Linux section.
+func SetUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) {
+	if s.Linux == nil {
+		return
+	}
+	if cmd.SysProcAttr == nil {
+		cmd.SysProcAttr = &syscall.SysProcAttr{}
+	}
+	attr := cmd.SysProcAttr
+	// The IDs and sizes of the spec are converted to int for SysProcIDMap.
+	for _, u := range s.Linux.UIDMappings {
+		log.Infof("Mapping host uid %d to container uid %d (size=%d)", u.HostID, u.ContainerID, u.Size)
+		entry := syscall.SysProcIDMap{ContainerID: int(u.ContainerID), HostID: int(u.HostID), Size: int(u.Size)}
+		attr.UidMappings = append(attr.UidMappings, entry)
+	}
+
+	// Same conversion for the group mappings.
+	for _, g := range s.Linux.GIDMappings {
+		log.Infof("Mapping host gid %d to container gid %d (size=%d)", g.HostID, g.ContainerID, g.Size)
+		entry := syscall.SysProcIDMap{ContainerID: int(g.ContainerID), HostID: int(g.HostID), Size: int(g.Size)}
+		attr.GidMappings = append(attr.GidMappings, entry)
+	}
+}
+
+// HasCapabilities reports whether every capability in 'cs' is present in
+// the effective set of the current process. Any failure to create or load
+// the capability state of the process is reported as false, as is the first
+// capability found missing.
+func HasCapabilities(cs ...capability.Cap) bool {
+	caps, err := capability.NewPid2(os.Getpid())
+	if err != nil || caps.Load() != nil {
+		return false
+	}
+	for _, want := range cs {
+		if held := caps.Get(capability.EFFECTIVE, want); !held {
+			return false
+		}
+	}
+	return true
+}
+
+// MaybeRunAsRoot ensures the process runs with capabilities needed to create a
+// sandbox, e.g. CAP_SYS_ADMIN, CAP_SYS_CHROOT, etc. If capabilities are needed,
+// it will create a new user namespace and re-execute the process as root
+// inside the namespace with the same arguments and environment.
+//
+// This function returns nil immediately when no new capability is needed. If
+// another process is executed, the current process exits from here with the
+// same exit code as the child; an error is returned if it cannot be run.
+func MaybeRunAsRoot() error {
+	if HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT, capability.CAP_SETUID, capability.CAP_SETGID) {
+		return nil
+	}
+
+	// Current process doesn't have required capabilities, create user namespace
+	// and run as root inside the namespace to acquire capabilities.
+	log.Infof("*** Re-running as root in new user namespace ***")
+
+	cmd := exec.Command("/proc/self/exe", os.Args[1:]...)
+
+	cmd.SysProcAttr = &syscall.SysProcAttr{
+		Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
+		// Set current user/group as root inside the namespace. Since we may not
+		// have CAP_SETUID/CAP_SETGID, just map root to the current user/group.
+		UidMappings: []syscall.SysProcIDMap{
+			{ContainerID: 0, HostID: os.Getuid(), Size: 1},
+		},
+		GidMappings: []syscall.SysProcIDMap{
+			{ContainerID: 0, HostID: os.Getgid(), Size: 1},
+		},
+		Credential:                 &syscall.Credential{Uid: 0, Gid: 0},
+		GidMappingsEnableSetgroups: false,
+
+		// Make sure child is killed when the parent terminates.
+		Pdeathsig: syscall.SIGKILL,
+	}
+
+	cmd.Env = os.Environ()
+	cmd.Stdin = os.Stdin
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	if err := cmd.Run(); err != nil {
+		if exit, ok := err.(*exec.ExitError); ok {
+			if ws, ok := exit.Sys().(syscall.WaitStatus); ok {
+				os.Exit(ws.ExitStatus())
+			}
+			log.Warningf("No wait status provided, exiting with -1: %v", err)
+			os.Exit(-1)
+		}
+		return fmt.Errorf("re-executing self: %v", err)
+	}
+	// Child completed with success; os.Exit below does not return.
+	os.Exit(0)
+	panic("unreachable")
+}
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
new file mode 100644
index 000000000..202518b58
--- /dev/null
+++ b/runsc/specutils/specutils.go
@@ -0,0 +1,553 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package specutils contains utility functions for working with OCI runtime
+// specs.
+package specutils
+
+import (
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"syscall"
+	"time"
+
+	"github.com/cenkalti/backoff"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/bits"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// ExePath must point to the runsc binary. The default refers to the running
+// binary itself; tests that aren't linked in the same binary change it.
+var ExePath = "/proc/self/exe"
+
+// Version is the supported spec version, taken from specs.Version.
+var Version = specs.Version
+
+// LogSpec writes the spec and its main sections to the debug log, one
+// section per log line so that each is easy to find.
+func LogSpec(spec *specs.Spec) {
+	log.Debugf("Spec: %+v", spec)
+	log.Debugf("Spec.Hooks: %+v", spec.Hooks)
+	log.Debugf("Spec.Linux: %+v", spec.Linux)
+	if lnx := spec.Linux; lnx != nil && lnx.Resources != nil {
+		log.Debugf("Spec.Linux.Resources.Memory: %+v", lnx.Resources.Memory)
+		log.Debugf("Spec.Linux.Resources.CPU: %+v", lnx.Resources.CPU)
+		log.Debugf("Spec.Linux.Resources.BlockIO: %+v", lnx.Resources.BlockIO)
+		log.Debugf("Spec.Linux.Resources.Network: %+v", lnx.Resources.Network)
+	}
+	log.Debugf("Spec.Process: %+v", spec.Process)
+	log.Debugf("Spec.Root: %+v", spec.Root)
+	log.Debugf("Spec.Mounts: %+v", spec.Mounts)
+}
+
+// ValidateSpec returns an error when the spec cannot be run by runsc.
+func ValidateSpec(spec *specs.Spec) error {
+	// Fields that must be present.
+	if spec.Process == nil {
+		return fmt.Errorf("Spec.Process must be defined: %+v", spec)
+	}
+	if len(spec.Process.Args) == 0 {
+		return fmt.Errorf("Spec.Process.Arg must be defined: %+v", spec.Process)
+	}
+	if spec.Root == nil {
+		return fmt.Errorf("Spec.Root must be defined: %+v", spec)
+	}
+	if len(spec.Root.Path) == 0 {
+		return fmt.Errorf("Spec.Root.Path must be defined: %+v", spec.Root)
+	}
+
+	// Fields that are rejected when present.
+	if spec.Solaris != nil {
+		return fmt.Errorf("Spec.Solaris is not supported: %+v", spec)
+	}
+	if spec.Windows != nil {
+		return fmt.Errorf("Spec.Windows is not supported: %+v", spec)
+	}
+	if len(spec.Process.SelinuxLabel) != 0 {
+		return fmt.Errorf("SELinux is not supported: %s", spec.Process.SelinuxLabel)
+	}
+
+	// Docker uses AppArmor by default, so just log that it's being ignored.
+	if spec.Process.ApparmorProfile != "" {
+		log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile)
+	}
+
+	// Warn when the spec leaves noNewPrivileges unset: PR_SET_NO_NEW_PRIVS is
+	// assumed to always be set. See kernel.Task.updateCredsForExecLocked.
+	if !spec.Process.NoNewPrivileges {
+		log.Warningf("noNewPrivileges ignored. PR_SET_NO_NEW_PRIVS is assumed to always be set.")
+	}
+
+	// TODO(gvisor.dev/issue/510): Apply seccomp to application inside sandbox.
+	if spec.Linux != nil && spec.Linux.Seccomp != nil {
+		log.Warningf("Seccomp spec is being ignored")
+	}
+
+	if spec.Linux != nil && spec.Linux.RootfsPropagation != "" {
+		if err := validateRootfsPropagation(spec.Linux.RootfsPropagation); err != nil {
+			return err
+		}
+	}
+	for _, m := range spec.Mounts {
+		if err := validateMount(&m); err != nil {
+			return err
+		}
+	}
+
+	// CRI specifies whether a container should start a new sandbox, or run
+	// another container in an existing sandbox.
+	switch SpecContainerType(spec) {
+	case ContainerTypeContainer:
+		// A container that joins an existing sandbox has to name that
+		// sandbox, so the sandbox ID must be set.
+		if _, ok := SandboxID(spec); !ok {
+			return fmt.Errorf("spec has container-type of container, but no sandbox ID set")
+		}
+	case ContainerTypeUnknown:
+		return fmt.Errorf("unknown container-type")
+	default:
+	}
+
+	return nil
+}
+
+// absPath returns rel unchanged when it is already absolute; otherwise it
+// returns rel joined onto the base path.
+func absPath(base, rel string) string {
+	if !filepath.IsAbs(rel) {
+		return filepath.Join(base, rel)
+	}
+	return rel
+}
+
+// OpenSpec opens the "config.json" spec file of the given bundle directory.
+func OpenSpec(bundleDir string) (*os.File, error) {
+	specPath := filepath.Join(bundleDir, "config.json")
+	return os.Open(specPath)
+}
+
+// ReadSpec reads the OCI runtime spec of the given bundle directory.
+// Potentially relative paths in the spec, e.g. spec.Root.Path and
+// mount.Source, are normalized into absolute paths.
+func ReadSpec(bundleDir string) (*specs.Spec, error) {
+	f, err := OpenSpec(bundleDir)
+	if err != nil {
+		return nil, fmt.Errorf("error opening spec file %q: %v", filepath.Join(bundleDir, "config.json"), err)
+	}
+	defer f.Close()
+	return ReadSpecFromFile(bundleDir, f)
+}
+
+// ReadSpecFromFile reads an OCI runtime spec from the start of the given
+// File and validates it. Relative paths (the root path and the source of
+// each mount) are then made absolute by prepending the bundle dir.
+func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error) {
+	if _, err := specFile.Seek(0, os.SEEK_SET); err != nil {
+		return nil, fmt.Errorf("error seeking to beginning of file %q: %v", specFile.Name(), err)
+	}
+	raw, err := ioutil.ReadAll(specFile)
+	if err != nil {
+		return nil, fmt.Errorf("error reading spec from file %q: %v", specFile.Name(), err)
+	}
+	var spec specs.Spec
+	if err := json.Unmarshal(raw, &spec); err != nil {
+		return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile.Name(), err, string(raw))
+	}
+	if err := ValidateSpec(&spec); err != nil {
+		return nil, err
+	}
+	// Turn any relative paths in the spec to absolute by prepending the bundleDir.
+	spec.Root.Path = absPath(bundleDir, spec.Root.Path)
+	for i := range spec.Mounts {
+		if src := &spec.Mounts[i].Source; *src != "" {
+			*src = absPath(bundleDir, *src)
+		}
+	}
+	return &spec, nil
+}
+
+// ReadMounts decodes a JSON list of mounts from the remaining content of f.
+func ReadMounts(f *os.File) ([]specs.Mount, error) {
+	raw, err := ioutil.ReadAll(f)
+	if err != nil {
+		return nil, fmt.Errorf("error reading mounts: %v", err)
+	}
+	var parsed []specs.Mount
+	if err := json.Unmarshal(raw, &parsed); err != nil {
+		return nil, fmt.Errorf("error unmarshaling mounts: %v\n %s", err, string(raw))
+	}
+	return parsed, nil
+}
+
+// Capabilities converts the capability sets of the spec to TaskCapabilities.
+// A nil specCaps yields empty sets; an unknown capability name is an error.
+func Capabilities(enableRaw bool, specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) {
+	// CAP_NET_RAW is left out of every set unless enableRaw is true.
+	skipSet := map[linux.Capability]struct{}{}
+	if !enableRaw {
+		skipSet[linux.CAP_NET_RAW] = struct{}{}
+	}
+
+	var caps auth.TaskCapabilities
+	if specCaps != nil {
+		var err error
+		if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding, skipSet); err != nil {
+			return nil, err
+		}
+		if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective, skipSet); err != nil {
+			return nil, err
+		}
+		if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable, skipSet); err != nil {
+			return nil, err
+		}
+		if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted, skipSet); err != nil {
+			return nil, err
+		}
+		// TODO(nlacasse): Support ambient capabilities (specCaps.Ambient is not read).
+	}
+	return &caps, nil
+}
+
+// AllCapabilities returns a LinuxCapabilities with every known capability in each set.
+func AllCapabilities() *specs.LinuxCapabilities {
+	var all []string
+	for name := range capFromName {
+		all = append(all, name)
+	}
+	caps := &specs.LinuxCapabilities{}
+	caps.Bounding = all
+	caps.Effective = all
+	caps.Inheritable = all
+	caps.Permitted = all
+	caps.Ambient = all
+	return caps
+}
+
+// AllCapabilitiesUint64 returns a bitmask with the bit of every known capability set.
+func AllCapabilitiesUint64() uint64 {
+	var mask uint64
+	for _, c := range capFromName {
+		mask |= bits.MaskOf64(int(c))
+	}
+	return mask
+}
+
+var capFromName = map[string]linux.Capability{ // capability name as written in the spec -> linux.Capability
+	"CAP_CHOWN":            linux.CAP_CHOWN,
+	"CAP_DAC_OVERRIDE":     linux.CAP_DAC_OVERRIDE,
+	"CAP_DAC_READ_SEARCH":  linux.CAP_DAC_READ_SEARCH,
+	"CAP_FOWNER":           linux.CAP_FOWNER,
+	"CAP_FSETID":           linux.CAP_FSETID,
+	"CAP_KILL":             linux.CAP_KILL,
+	"CAP_SETGID":           linux.CAP_SETGID,
+	"CAP_SETUID":           linux.CAP_SETUID,
+	"CAP_SETPCAP":          linux.CAP_SETPCAP,
+	"CAP_LINUX_IMMUTABLE":  linux.CAP_LINUX_IMMUTABLE,
+	"CAP_NET_BIND_SERVICE": linux.CAP_NET_BIND_SERVICE,
+	"CAP_NET_BROADCAST":    linux.CAP_NET_BROADCAST,
+	"CAP_NET_ADMIN":        linux.CAP_NET_ADMIN,
+	"CAP_NET_RAW":          linux.CAP_NET_RAW,
+	"CAP_IPC_LOCK":         linux.CAP_IPC_LOCK,
+	"CAP_IPC_OWNER":        linux.CAP_IPC_OWNER,
+	"CAP_SYS_MODULE":       linux.CAP_SYS_MODULE,
+	"CAP_SYS_RAWIO":        linux.CAP_SYS_RAWIO,
+	"CAP_SYS_CHROOT":       linux.CAP_SYS_CHROOT,
+	"CAP_SYS_PTRACE":       linux.CAP_SYS_PTRACE,
+	"CAP_SYS_PACCT":        linux.CAP_SYS_PACCT,
+	"CAP_SYS_ADMIN":        linux.CAP_SYS_ADMIN,
+	"CAP_SYS_BOOT":         linux.CAP_SYS_BOOT,
+	"CAP_SYS_NICE":         linux.CAP_SYS_NICE,
+	"CAP_SYS_RESOURCE":     linux.CAP_SYS_RESOURCE,
+	"CAP_SYS_TIME":         linux.CAP_SYS_TIME,
+	"CAP_SYS_TTY_CONFIG":   linux.CAP_SYS_TTY_CONFIG,
+	"CAP_MKNOD":            linux.CAP_MKNOD,
+	"CAP_LEASE":            linux.CAP_LEASE,
+	"CAP_AUDIT_WRITE":      linux.CAP_AUDIT_WRITE,
+	"CAP_AUDIT_CONTROL":    linux.CAP_AUDIT_CONTROL,
+	"CAP_SETFCAP":          linux.CAP_SETFCAP,
+	"CAP_MAC_OVERRIDE":     linux.CAP_MAC_OVERRIDE,
+	"CAP_MAC_ADMIN":        linux.CAP_MAC_ADMIN,
+	"CAP_SYSLOG":           linux.CAP_SYSLOG,
+	"CAP_WAKE_ALARM":       linux.CAP_WAKE_ALARM,
+	"CAP_BLOCK_SUSPEND":    linux.CAP_BLOCK_SUSPEND,
+	"CAP_AUDIT_READ":       linux.CAP_AUDIT_READ,
+}
+
+// capsFromNames converts capability names to a CapabilitySet, leaving out
+// any capability found in skipSet. An unknown name is an error.
+func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth.CapabilitySet, error) {
+	var kept []linux.Capability
+	for _, name := range names {
+		c, known := capFromName[name]
+		if !known {
+			return 0, fmt.Errorf("unknown capability %q", name)
+		}
+		if _, skip := skipSet[c]; !skip {
+			kept = append(kept, c)
+		}
+	}
+	return auth.CapabilitySetOfMany(kept), nil
+}
+
+// Is9PMount returns true if the given mount can be mounted as an external gofer.
+func Is9PMount(m specs.Mount) bool {
+	var isBind bool
+	switch m.Type {
+	case "bind":
+		isBind = true
+	default:
+		for _, opt := range m.Options {
+			if opt == "bind" || opt == "rbind" {
+				isBind = true
+				break
+			}
+		}
+	}
+	return isBind && m.Source != "" && IsSupportedDevMount(m)
+}
+
+// IsSupportedDevMount returns true if the mount is a supported /dev mount.
+// Only mount that does not conflict with runsc default /dev mount is
+// supported.
+func IsSupportedDevMount(m specs.Mount) bool {
+	// These are devices exist inside sentry. See pkg/sentry/fs/dev/dev.go
+	var existingDevices = []string{
+		"/dev/fd", "/dev/stdin", "/dev/stdout", "/dev/stderr",
+		"/dev/null", "/dev/zero", "/dev/full", "/dev/random",
+		"/dev/urandom", "/dev/shm", "/dev/pts", "/dev/ptmx",
+	}
+	dst := filepath.Clean(m.Destination)
+	if dst == "/dev" {
+		// OCI spec uses many different mounts for the things inside of '/dev'. We
+		// have a single mount at '/dev' that is always mounted, regardless of
+		// whether it was asked for, as the spec says we SHOULD.
+		return false
+	}
+	for _, dev := range existingDevices {
+		if dst == dev || strings.HasPrefix(dst, dev+"/") {
+			return false
+		}
+	}
+	return true
+}
+
+// WaitForReady retries 'ready' with exponential backoff until it returns true.
+// It returns an error when 'timeout' elapses, when 'ready' fails, or when the
+// process with the given pid is found to have terminated.
+func WaitForReady(pid int, timeout time.Duration, ready func() (bool, error)) error {
+	b := backoff.NewExponentialBackOff()
+	b.InitialInterval = 1 * time.Millisecond
+	b.MaxInterval = 1 * time.Second
+	b.MaxElapsedTime = timeout
+
+	op := func() error {
+		if ok, err := ready(); err != nil {
+			return backoff.Permanent(err)
+		} else if ok {
+			return nil
+		}
+
+		// Not ready yet: check whether the process is still running.
+		// If the process is alive, child is 0 because of the NOHANG option.
+		// If the process has terminated, child equals the process id.
+		var ws syscall.WaitStatus
+		var ru syscall.Rusage
+		child, err := syscall.Wait4(pid, &ws, syscall.WNOHANG, &ru)
+		if err != nil {
+			return backoff.Permanent(fmt.Errorf("error waiting for process: %v", err))
+		} else if child == pid {
+			return backoff.Permanent(fmt.Errorf("process %d has terminated", pid))
+		}
+		return fmt.Errorf("process %d not running yet", pid)
+	}
+	return backoff.Retry(op, b)
+}
+
+// DebugLogFile creates the parent directory of the log file if needed and
+// opens the file for appending. If 'logPattern' ends with '/', it's used as
+// a directory with a default file name. 'logPattern' can contain variables:
+//   - %TIMESTAMP%: is replaced with a timestamp using the following format:
+//     <yyyymmdd-hhmmss.uuuuuu>
+//   - %COMMAND%: is replaced with 'command'
+//   - %TEST%: is replaced with 'test' (omitted by default)
+func DebugLogFile(logPattern, command, test string) (*os.File, error) {
+	if strings.HasSuffix(logPattern, "/") {
+		// Default format: <debug-log>/runsc.log.<yyyymmdd-hhmmss.uuuuuu>.<command>
+		logPattern += "runsc.log.%TIMESTAMP%.%COMMAND%"
+	}
+	logPattern = strings.Replace(logPattern, "%TIMESTAMP%", time.Now().Format("20060102-150405.000000"), -1)
+	logPattern = strings.Replace(logPattern, "%COMMAND%", command, -1)
+	logPattern = strings.Replace(logPattern, "%TEST%", test, -1)
+
+	dir := filepath.Dir(logPattern)
+	if err := os.MkdirAll(dir, 0775); err != nil {
+		return nil, fmt.Errorf("error creating dir %q: %v", dir, err)
+	}
+	return os.OpenFile(logPattern, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664)
+}
+
+// Mount creates the mount point dst and calls syscall.Mount with the given flags.
+func Mount(src, dst, typ string, flags uint32) error {
+	// The mount point must be of the same kind as the source (file or
+	// directory), so find out which one the source is.
+	var isDir bool
+	if typ == "proc" {
+		// Special case, as there is no source directory for proc mounts.
+		isDir = true
+	} else if fi, err := os.Stat(src); err != nil {
+		return fmt.Errorf("Stat(%q) failed: %v", src, err)
+	} else {
+		isDir = fi.IsDir()
+	}
+
+	if isDir {
+		// Create the destination directory, including missing parents.
+		if err := os.MkdirAll(dst, 0777); err != nil {
+			return fmt.Errorf("Mkdir(%q) failed: %v", dst, err)
+		}
+	} else {
+		// Create the parent destination directory.
+		parent := path.Dir(dst)
+		if err := os.MkdirAll(parent, 0777); err != nil {
+			return fmt.Errorf("Mkdir(%q) failed: %v", parent, err)
+		}
+		// Create the destination file if it does not exist.
+		f, err := os.OpenFile(dst, syscall.O_CREAT, 0777)
+		if err != nil {
+			return fmt.Errorf("Open(%q) failed: %v", dst, err)
+		}
+		f.Close()
+	}
+
+	// Do the mount. No mount data is passed.
+	if err := syscall.Mount(src, dst, typ, uintptr(flags), ""); err != nil {
+		return fmt.Errorf("Mount(%q, %q, %d) failed: %v", src, dst, flags, err)
+	}
+	return nil
+}
+
+// ContainsStr reports whether 'str' equals at least one element of 'strs'.
+func ContainsStr(strs []string, str string) bool {
+	for i := 0; i < len(strs); i++ {
+		if strs[i] == str {
+			return true
+		}
+	}
+	return false
+}
+
+// Cleanup lets a deferred cleanup be cancelled once it is no longer wanted.
+// Usage:
+// c := MakeCleanup(func() { f.Close() })
+// defer c.Clean() // closes the file unless Release was called before.
+// ...
+// c.Release() // on success: the file stays open and is handed back.
+// return f
+type Cleanup struct {
+	clean func()
+}
+
+// MakeCleanup returns a Cleanup armed with f.
+func MakeCleanup(f func()) Cleanup {
+	return Cleanup{clean: f}
+}
+
+// Clean runs the cleanup function at most once; it is a no-op after Release.
+func (c *Cleanup) Clean() {
+	if c.clean == nil {
+		return
+	}
+	c.clean()
+	c.clean = nil
+}
+
+// Release disarms the cleanup so that later calls to Clean do nothing.
+func (c *Cleanup) Release() {
+	c.clean = nil
+}
+
+// RetryEintr calls f, and calls it again for as long as the returned error
+// is EINTR. The values of the first call that ends with a different error
+// (or with no error) are returned.
+func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) {
+	r1, r2, err := f()
+	for err == syscall.EINTR {
+		r1, r2, err = f()
+	}
+	return r1, r2, err
+}
+
+// GetOOMScoreAdj returns the value in /proc/<pid>/oom_score_adj for the given pid.
+func GetOOMScoreAdj(pid int) (int, error) {
+	raw, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid))
+	if err != nil {
+		return 0, err
+	}
+	return strconv.Atoi(strings.TrimSpace(string(raw)))
+}
+
+// GetParentPid gets the parent process ID of the specified PID by reading
+// /proc/<pid>/stat. It returns an error when the file cannot be read or
+// does not have the expected layout.
+func GetParentPid(pid int) (int, error) {
+	data, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/stat", pid))
+	if err != nil {
+		return 0, err
+	}
+
+	// The second field is the executable name wrapped in parentheses. The
+	// name may itself contain spaces or parentheses, which would shift the
+	// fields of a plain whitespace split, so parsing resumes after the last
+	// ')' in the line.
+	stat := string(data)
+	end := strings.LastIndex(stat, ")")
+	if end < 0 {
+		return 0, fmt.Errorf("malformed stat line for pid %d: %q", pid, stat)
+	}
+
+	var state string
+	var ppid int
+	// state is ignored; ppid is the field right after it.
+	if _, err := fmt.Sscanf(stat[end+1:], "%s %d", &state, &ppid); err != nil {
+		return 0, err
+	}
+
+	return ppid, nil
+}
+
+// EnvVar looks for a variable in the env slice, whose entries are assumed to
+// have the format "NAME=VALUE". It returns the value of the first match.
+func EnvVar(env []string, name string) (string, bool) {
+	key := name + "="
+	for i := range env {
+		if entry := env[i]; strings.HasPrefix(entry, key) {
+			return entry[len(key):], true
+		}
+	}
+	return "", false
+}
+
+// FaqErrorMsg appends to msg a pointer to the given anchor of the gVisor FAQ.
+func FaqErrorMsg(anchor, msg string) string {
+	return fmt.Sprint(msg, "; see https://gvisor.dev/faq#", anchor, " for more details")
+}
diff --git a/runsc/specutils/specutils_test.go b/runsc/specutils/specutils_test.go
new file mode 100644
index 000000000..2c86fffe8
--- /dev/null
+++ b/runsc/specutils/specutils_test.go
@@ -0,0 +1,265 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package specutils
+
+import (
+	"fmt"
+	"os/exec"
+	"strings"
+	"testing"
+	"time"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+func TestWaitForReadyHappy(t *testing.T) {
+	cmd := exec.Command("/bin/sleep", "1000")
+	if err := cmd.Start(); err != nil {
+		t.Fatalf("cmd.Start() failed, err: %v", err)
+	}
+	defer cmd.Wait()
+
+	var count int
+	err := WaitForReady(cmd.Process.Pid, 5*time.Second, func() (bool, error) {
+		if count < 3 {
+			count++
+			return false, nil
+		}
+		return true, nil
+	})
+	if err != nil {
+		t.Errorf("ProcessWaitReady got: %v, expected: nil", err)
+	}
+	cmd.Process.Kill()
+}
+
+func TestWaitForReadyFail(t *testing.T) {
+	cmd := exec.Command("/bin/sleep", "1000")
+	if err := cmd.Start(); err != nil {
+		t.Fatalf("cmd.Start() failed, err: %v", err)
+	}
+	defer cmd.Wait()
+
+	var count int
+	err := WaitForReady(cmd.Process.Pid, 5*time.Second, func() (bool, error) {
+		if count < 3 {
+			count++
+			return false, nil
+		}
+		return false, fmt.Errorf("Fake error")
+	})
+	if err == nil {
+		t.Errorf("ProcessWaitReady got: nil, expected: error")
+	}
+	cmd.Process.Kill()
+}
+
+// TestWaitForReadyNotRunning expects a "terminated" error for an exited process.
+func TestWaitForReadyNotRunning(t *testing.T) {
+	cmd := exec.Command("/bin/true")
+	if err := cmd.Start(); err != nil {
+		t.Fatalf("cmd.Start() failed, err: %v", err)
+	}
+	defer cmd.Wait()
+
+	neverReady := func() (bool, error) { return false, nil }
+	err := WaitForReady(cmd.Process.Pid, 5*time.Second, neverReady)
+	switch {
+	case err == nil:
+		t.Errorf("ProcessWaitReady incorrectly succeeded")
+	case !strings.Contains(err.Error(), "terminated"):
+		t.Errorf("ProcessWaitReady got: %v, expected: process terminated", err)
+	}
+}
+
+// TestWaitForReadyTimeout expects "not running yet" when 'ready' never succeeds.
+func TestWaitForReadyTimeout(t *testing.T) {
+	cmd := exec.Command("/bin/sleep", "1000")
+	if err := cmd.Start(); err != nil {
+		t.Fatalf("cmd.Start() failed, err: %v", err)
+	}
+	defer cmd.Wait()
+	neverReady := func() (bool, error) { return false, nil }
+	err := WaitForReady(cmd.Process.Pid, 50*time.Millisecond, neverReady)
+	// A nil error must fail the test instead of panicking in err.Error().
+	if err == nil || !strings.Contains(err.Error(), "not running yet") {
+		t.Errorf("ProcessWaitReady got: %v, expected: not running yet", err)
+	}
+	cmd.Process.Kill()
+}
+
+func TestSpecInvalid(t *testing.T) {
+	for _, test := range []struct {
+		name  string
+		spec  specs.Spec
+		error string
+	}{
+		{
+			name: "valid",
+			spec: specs.Spec{
+				Root: &specs.Root{Path: "/"},
+				Process: &specs.Process{
+					Args: []string{"/bin/true"},
+				},
+				Mounts: []specs.Mount{
+					{
+						Source:      "src",
+						Destination: "/dst",
+					},
+				},
+			},
+			error: "",
+		},
+		{
+			name: "valid+warning",
+			spec: specs.Spec{
+				Root: &specs.Root{Path: "/"},
+				Process: &specs.Process{
+					Args: []string{"/bin/true"},
+					// Normally set by docker; ValidateSpec only logs a warning for it.
+					ApparmorProfile: "someprofile",
+				},
+				// Normally set by docker; ValidateSpec only logs a warning for it.
+				Linux: &specs.Linux{Seccomp: &specs.LinuxSeccomp{}},
+			},
+			error: "",
+		},
+		{
+			name: "no root",
+			spec: specs.Spec{
+				Process: &specs.Process{
+					Args: []string{"/bin/true"},
+				},
+			},
+			error: "must be defined",
+		},
+		{
+			name: "empty root",
+			spec: specs.Spec{
+				Root: &specs.Root{},
+				Process: &specs.Process{
+					Args: []string{"/bin/true"},
+				},
+			},
+			error: "must be defined",
+		},
+		{
+			name: "no process",
+			spec: specs.Spec{
+				Root: &specs.Root{Path: "/"},
+			},
+			error: "must be defined",
+		},
+		{
+			name: "empty args",
+			spec: specs.Spec{
+				Root:    &specs.Root{Path: "/"},
+				Process: &specs.Process{},
+			},
+			error: "must be defined",
+		},
+		{
+			name: "selinux",
+			spec: specs.Spec{
+				Root: &specs.Root{Path: "/"},
+				Process: &specs.Process{
+					Args:         []string{"/bin/true"},
+					SelinuxLabel: "somelabel",
+				},
+			},
+			error: "is not supported",
+		},
+		{
+			name: "solaris",
+			spec: specs.Spec{
+				Root: &specs.Root{Path: "/"},
+				Process: &specs.Process{
+					Args: []string{"/bin/true"},
+				},
+				Solaris: &specs.Solaris{},
+			},
+			error: "is not supported",
+		},
+		{
+			name: "windows",
+			spec: specs.Spec{
+				Root: &specs.Root{Path: "/"},
+				Process: &specs.Process{
+					Args: []string{"/bin/true"},
+				},
+				Windows: &specs.Windows{},
+			},
+			error: "is not supported",
+		},
+		{
+			name: "relative mount destination",
+			spec: specs.Spec{
+				Root: &specs.Root{Path: "/"},
+				Process: &specs.Process{
+					Args: []string{"/bin/true"},
+				},
+				Mounts: []specs.Mount{
+					{
+						Source:      "src",
+						Destination: "dst",
+					},
+				},
+			},
+			error: "must be an absolute path",
+		},
+		{
+			name: "invalid mount option",
+			spec: specs.Spec{
+				Root: &specs.Root{Path: "/"},
+				Process: &specs.Process{
+					Args: []string{"/bin/true"},
+				},
+				Mounts: []specs.Mount{
+					{
+						Source:      "/src",
+						Destination: "/dst",
+						Type:        "bind",
+						Options:     []string{"shared"},
+					},
+				},
+			},
+			error: "is not supported",
+		},
+		{
+			name: "invalid rootfs propagation",
+			spec: specs.Spec{
+				Root: &specs.Root{Path: "/"},
+				Process: &specs.Process{
+					Args: []string{"/bin/true"},
+				},
+				Linux: &specs.Linux{
+					RootfsPropagation: "foo",
+				},
+			},
+			error: "root mount propagation option must specify private or slave",
+		},
+	} {
+		err := ValidateSpec(&test.spec)
+		if len(test.error) == 0 {
+			if err != nil {
+				t.Errorf("ValidateSpec(%q) failed, err: %v", test.name, err)
+			}
+		} else {
+			if err == nil || !strings.Contains(err.Error(), test.error) {
+				t.Errorf("ValidateSpec(%q) wrong error, got: %v, want: .*%s.*", test.name, err, test.error)
+			}
+		}
+	}
+}
diff --git a/runsc/version.go b/runsc/version.go
new file mode 100644
index 000000000..ab9194b9d
--- /dev/null
+++ b/runsc/version.go
@@ -0,0 +1,18 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+// version is set during linking via the "main.version" x_defs entry in BUILD.
+var version = "VERSION_MISSING"
diff --git a/runsc/version_test.sh b/runsc/version_test.sh
new file mode 100755
index 000000000..747350654
--- /dev/null
+++ b/runsc/version_test.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euf -x -o pipefail
+
+# $1 is the path to the runsc binary under test.
+readonly runsc="$1"
+# Assign, then mark readonly: "readonly v=$(cmd)" hides cmd's exit status.
+version=$("$runsc" --version); readonly version
+
+# Version should not contain VERSION: the unstamped default is VERSION_MISSING,
+# which will also appear if something is wrong with workspace_status.sh script.
+if [[ $version =~ "VERSION" ]]; then
+  echo "FAIL: Got bad version $version"
+  exit 1
+fi
+
+# Version should contain at least one number.
+if [[ ! $version =~ [0-9] ]]; then
+  echo "FAIL: Got bad version $version"
+  exit 1
+fi
+echo "PASS: Got OK version $version"