30 files changed, 6781 insertions, 0 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
new file mode 100644
index 000000000..aad2a41de
--- /dev/null
+++ b/runsc/boot/BUILD
@@ -0,0 +1,137 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "boot",
+    srcs = [
+        "compat.go",
+        "compat_amd64.go",
+        "compat_arm64.go",
+        "config.go",
+        "controller.go",
+        "debug.go",
+        "events.go",
+        "fs.go",
+        "limits.go",
+        "loader.go",
+        "network.go",
+        "strace.go",
+        "vfs.go",
+    ],
+    visibility = [
+        "//pkg/test:__subpackages__",
+        "//runsc:__subpackages__",
+        "//test:__subpackages__",
+    ],
+    deps = [
+        "//pkg/abi",
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/control/server",
+        "//pkg/cpuid",
+        "//pkg/eventchannel",
+        "//pkg/fspath",
+        "//pkg/log",
+        "//pkg/memutil",
+        "//pkg/rand",
+        "//pkg/refs",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/arch:registers_go_proto",
+        "//pkg/sentry/control",
+        "//pkg/sentry/devices/memdev",
+        "//pkg/sentry/devices/ttydev",
+        "//pkg/sentry/devices/tundev",
+        "//pkg/sentry/fdimport",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/fs/dev",
+        "//pkg/sentry/fs/gofer",
+        "//pkg/sentry/fs/host",
+        "//pkg/sentry/fs/proc",
+        "//pkg/sentry/fs/ramfs",
+        "//pkg/sentry/fs/sys",
+        "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/fs/tty",
+        "//pkg/sentry/fs/user",
+        "//pkg/sentry/fsimpl/devpts",
+        "//pkg/sentry/fsimpl/devtmpfs",
+        "//pkg/sentry/fsimpl/fuse",
+        "//pkg/sentry/fsimpl/gofer",
+        "//pkg/sentry/fsimpl/host",
+        "//pkg/sentry/fsimpl/overlay",
+        "//pkg/sentry/fsimpl/proc",
+        "//pkg/sentry/fsimpl/sys",
+        "//pkg/sentry/fsimpl/tmpfs",
+        "//pkg/sentry/inet",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel:uncaught_signal_go_proto",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/limits",
+        "//pkg/sentry/loader",
+        "//pkg/sentry/pgalloc",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/sighandling",
+        "//pkg/sentry/socket/hostinet",
+        "//pkg/sentry/socket/netlink",
+        "//pkg/sentry/socket/netlink/route",
+        "//pkg/sentry/socket/netlink/uevent",
+        "//pkg/sentry/socket/netstack",
+        "//pkg/sentry/socket/unix",
+        "//pkg/sentry/state",
+        "//pkg/sentry/strace",
+        "//pkg/sentry/syscalls/linux/vfs2",
+        "//pkg/sentry/time",
+        "//pkg/sentry/unimpl:unimplemented_syscall_go_proto",
+        "//pkg/sentry/usage",
+        "//pkg/sentry/vfs",
+        "//pkg/sentry/watchdog",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/tcpip",
+        "//pkg/tcpip/link/fdbased",
+        "//pkg/tcpip/link/loopback",
+        "//pkg/tcpip/link/qdisc/fifo",
+        "//pkg/tcpip/link/sniffer",
+        "//pkg/tcpip/network/arp",
+        "//pkg/tcpip/network/ipv4",
+        "//pkg/tcpip/network/ipv6",
+        "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/icmp",
+        "//pkg/tcpip/transport/raw",
+        "//pkg/tcpip/transport/tcp",
+        "//pkg/tcpip/transport/udp",
+        "//pkg/urpc",
+        "//runsc/boot/filter",
+        "//runsc/boot/platforms",
+        "//runsc/boot/pprof",
+        "//runsc/specutils",
+        "@com_github_golang_protobuf//proto:go_default_library",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+go_test(
+    name = "boot_test",
+    size = "small",
+    srcs = [
+        "compat_test.go",
+        "fs_test.go",
+        "loader_test.go",
+    ],
+    library = ":boot",
+    deps = [
+        "//pkg/control/server",
+        "//pkg/fspath",
+        "//pkg/log",
+        "//pkg/p9",
+        "//pkg/sentry/contexttest",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/unet",
+        "//runsc/fsgofer",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
new file mode 100644
index 000000000..84c67cbc2
--- /dev/null
+++ b/runsc/boot/compat.go
@@ -0,0 +1,202 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"os"
+	"syscall"
+
+	"github.com/golang/protobuf/proto"
+	"gvisor.dev/gvisor/pkg/eventchannel"
+	"gvisor.dev/gvisor/pkg/log"
+	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+	ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
+	"gvisor.dev/gvisor/pkg/sentry/strace"
+	spb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+func initCompatLogs(fd int) error {
+	ce, err := newCompatEmitter(fd)
+	if err != nil {
+		return err
+	}
+	eventchannel.AddEmitter(ce)
+	return nil
+}
+
+type compatEmitter struct {
+	sink    *log.BasicLogger
+	nameMap strace.SyscallMap
+
+	// mu protects the fields below.
+	mu sync.Mutex
+
+	// trackers map syscall number to the respective tracker instance.
+	// Protected by 'mu'.
+	trackers map[uint64]syscallTracker
+}
+
+func newCompatEmitter(logFD int) (*compatEmitter, error) {
+	nameMap, ok := getSyscallNameMap()
+	if !ok {
+		return nil, fmt.Errorf("Linux syscall table not found")
+	}
+
+	c := &compatEmitter{
+		// Always logs to default logger.
+		sink:     log.Log(),
+		nameMap:  nameMap,
+		trackers: make(map[uint64]syscallTracker),
+	}
+
+	if logFD > 0 {
+		f := os.NewFile(uintptr(logFD), "user log file")
+		target := &log.MultiEmitter{c.sink, log.K8sJSONEmitter{&log.Writer{Next: f}}}
+		c.sink = &log.BasicLogger{Level: log.Info, Emitter: target}
+	}
+	return c, nil
+}
+
+// Emit implements eventchannel.Emitter.
+func (c *compatEmitter) Emit(msg proto.Message) (bool, error) {
+	switch m := msg.(type) {
+	case *spb.UnimplementedSyscall:
+		c.emitUnimplementedSyscall(m)
+	case *ucspb.UncaughtSignal:
+		c.emitUncaughtSignal(m)
+	}
+
+	return false, nil
+}
+
+func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) {
+	regs := us.Registers
+
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	sysnr := syscallNum(regs)
+	tr := c.trackers[sysnr]
+	if tr == nil {
+		switch sysnr {
+		case syscall.SYS_PRCTL:
+			// args: cmd, ...
+			tr = newArgsTracker(0)
+
+		case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL, syscall.SYS_FUTEX, syscall.SYS_FALLOCATE:
+			// args: fd/addr, cmd, ...
+			tr = newArgsTracker(1)
+
+		case syscall.SYS_GETSOCKOPT, syscall.SYS_SETSOCKOPT:
+			// args: fd, level, name, ...
+			tr = newArgsTracker(1, 2)
+
+		case syscall.SYS_SEMCTL:
+			// args: semid, semnum, cmd, ...
+			tr = newArgsTracker(2)
+
+		default:
+			tr = newArchArgsTracker(sysnr)
+			if tr == nil {
+				tr = &onceTracker{}
+			}
+		}
+		c.trackers[sysnr] = tr
+	}
+
+	if tr.shouldReport(regs) {
+		name := c.nameMap.Name(uintptr(sysnr))
+		c.sink.Infof("Unsupported syscall %s(%#x,%#x,%#x,%#x,%#x,%#x). It is "+
+			"likely that you can safely ignore this message and that this is not "+
+			"the cause of any error. Please, refer to %s/%s for more information.",
+			name, argVal(0, regs), argVal(1, regs), argVal(2, regs), argVal(3, regs),
+			argVal(4, regs), argVal(5, regs), syscallLink, name)
+
+		tr.onReported(regs)
+	}
+}
+
+func (c *compatEmitter) emitUncaughtSignal(msg *ucspb.UncaughtSignal) {
+	sig := syscall.Signal(msg.SignalNumber)
+	c.sink.Infof(
+		"Uncaught signal: %q (%d), PID: %d, TID: %d, fault addr: %#x",
+		sig, msg.SignalNumber, msg.Pid, msg.Tid, msg.FaultAddr)
+}
+
+// Close implements eventchannel.Emitter.
+func (c *compatEmitter) Close() error {
+	c.sink = nil
+	return nil
+}
+
+// syscallTracker interface allows filters to apply differently depending on
+// the syscall and arguments.
+type syscallTracker interface {
+	// shouldReport returns true is the syscall should be reported.
+	shouldReport(regs *rpb.Registers) bool
+
+	// onReported marks the syscall as reported.
+	onReported(regs *rpb.Registers)
+}
+
+// onceTracker reports only a single time, used for most syscalls.
+type onceTracker struct {
+	reported bool
+}
+
+func (o *onceTracker) shouldReport(_ *rpb.Registers) bool {
+	return !o.reported
+}
+
+func (o *onceTracker) onReported(_ *rpb.Registers) {
+	o.reported = true
+}
+
+// argsTracker reports only once for each different combination of arguments.
+// It's used for generic syscalls like ioctl to report once per 'cmd'.
+type argsTracker struct {
+	// argsIdx is the syscall arguments to use as unique ID.
+	argsIdx  []int
+	reported map[string]struct{}
+	count    int
+}
+
+func newArgsTracker(argIdx ...int) *argsTracker {
+	return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})}
+}
+
+// key returns the command based on the syscall argument index.
+func (a *argsTracker) key(regs *rpb.Registers) string {
+	var rv string
+	for _, idx := range a.argsIdx {
+		rv += fmt.Sprintf("%d|", argVal(idx, regs))
+	}
+	return rv
+}
+
+func (a *argsTracker) shouldReport(regs *rpb.Registers) bool {
+	if a.count >= reportLimit {
+		return false
+	}
+	_, ok := a.reported[a.key(regs)]
+	return !ok
+}
+
+func (a *argsTracker) onReported(regs *rpb.Registers) {
+	a.count++
+	a.reported[a.key(regs)] = struct{}{}
+}
diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go
new file mode 100644
index 000000000..8eb76b2ba
--- /dev/null
+++ b/runsc/boot/compat_amd64.go
@@ -0,0 +1,100 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+	"gvisor.dev/gvisor/pkg/sentry/strace"
+)
+
+const (
+	// reportLimit is the max number of events that should be reported per
+	// tracker.
+	reportLimit = 100
+	syscallLink = "https://gvisor.dev/c/linux/amd64"
+)
+
+// newRegs create a empty Registers instance.
+func newRegs() *rpb.Registers {
+	return &rpb.Registers{
+		Arch: &rpb.Registers_Amd64{
+			Amd64: &rpb.AMD64Registers{},
+		},
+	}
+}
+
+func argVal(argIdx int, regs *rpb.Registers) uint64 {
+	amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
+
+	switch argIdx {
+	case 0:
+		return amd64Regs.Rdi
+	case 1:
+		return amd64Regs.Rsi
+	case 2:
+		return amd64Regs.Rdx
+	case 3:
+		return amd64Regs.R10
+	case 4:
+		return amd64Regs.R8
+	case 5:
+		return amd64Regs.R9
+	}
+	panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+}
+
+func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) {
+	amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
+
+	switch argIdx {
+	case 0:
+		amd64Regs.Rdi = argVal
+	case 1:
+		amd64Regs.Rsi = argVal
+	case 2:
+		amd64Regs.Rdx = argVal
+	case 3:
+		amd64Regs.R10 = argVal
+	case 4:
+		amd64Regs.R8 = argVal
+	case 5:
+		amd64Regs.R9 = argVal
+	default:
+		panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+	}
+}
+
+func getSyscallNameMap() (strace.SyscallMap, bool) {
+	return strace.Lookup(abi.Linux, arch.AMD64)
+}
+
+func syscallNum(regs *rpb.Registers) uint64 {
+	amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
+	return amd64Regs.OrigRax
+}
+
+func newArchArgsTracker(sysnr uint64) syscallTracker {
+	switch sysnr {
+	case syscall.SYS_ARCH_PRCTL:
+		// args: cmd, ...
+		return newArgsTracker(0)
+	}
+	return nil
+}
diff --git a/runsc/boot/compat_arm64.go b/runsc/boot/compat_arm64.go
new file mode 100644
index 000000000..bce9d95b3
--- /dev/null
+++ b/runsc/boot/compat_arm64.go
@@ -0,0 +1,95 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+	"gvisor.dev/gvisor/pkg/sentry/strace"
+)
+
+const (
+	// reportLimit is the max number of events that should be reported per
+	// tracker.
+	reportLimit = 100
+	syscallLink = "https://gvisor.dev/c/linux/arm64"
+)
+
+// newRegs create a empty Registers instance.
+func newRegs() *rpb.Registers {
+	return &rpb.Registers{
+		Arch: &rpb.Registers_Arm64{
+			Arm64: &rpb.ARM64Registers{},
+		},
+	}
+}
+
+func argVal(argIdx int, regs *rpb.Registers) uint64 {
+	arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+
+	switch argIdx {
+	case 0:
+		return arm64Regs.R0
+	case 1:
+		return arm64Regs.R1
+	case 2:
+		return arm64Regs.R2
+	case 3:
+		return arm64Regs.R3
+	case 4:
+		return arm64Regs.R4
+	case 5:
+		return arm64Regs.R5
+	}
+	panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+}
+
+func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) {
+	arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+
+	switch argIdx {
+	case 0:
+		arm64Regs.R0 = argVal
+	case 1:
+		arm64Regs.R1 = argVal
+	case 2:
+		arm64Regs.R2 = argVal
+	case 3:
+		arm64Regs.R3 = argVal
+	case 4:
+		arm64Regs.R4 = argVal
+	case 5:
+		arm64Regs.R5 = argVal
+	default:
+		panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+	}
+}
+
+func getSyscallNameMap() (strace.SyscallMap, bool) {
+	return strace.Lookup(abi.Linux, arch.ARM64)
+}
+
+func syscallNum(regs *rpb.Registers) uint64 {
+	arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+	return arm64Regs.R8
+}
+
+func newArchArgsTracker(sysnr uint64) syscallTracker {
+	// currently, no arch specific syscalls need to be handled here.
+	return nil
+}
diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go
new file mode 100644
index 000000000..839c5303b
--- /dev/null
+++ b/runsc/boot/compat_test.go
@@ -0,0 +1,90 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"testing"
+)
+
+func TestOnceTracker(t *testing.T) {
+	o := onceTracker{}
+	if !o.shouldReport(nil) {
+		t.Error("first call to checkAndMark, got: false, want: true")
+	}
+	o.onReported(nil)
+	for i := 0; i < 2; i++ {
+		if o.shouldReport(nil) {
+			t.Error("after first call to checkAndMark, got: true, want: false")
+		}
+	}
+}
+
+func TestArgsTracker(t *testing.T) {
+	for _, tc := range []struct {
+		name   string
+		idx    []int
+		arg1_1 uint64
+		arg1_2 uint64
+		arg2_1 uint64
+		arg2_2 uint64
+		want   bool
+	}{
+		{name: "same arg1", idx: []int{0}, arg1_1: 123, arg1_2: 123, want: false},
+		{name: "same arg2", idx: []int{1}, arg2_1: 123, arg2_2: 123, want: false},
+		{name: "diff arg1", idx: []int{0}, arg1_1: 123, arg1_2: 321, want: true},
+		{name: "diff arg2", idx: []int{1}, arg2_1: 123, arg2_2: 321, want: true},
+		{name: "cmd is uint32", idx: []int{0}, arg2_1: 0xdead00000123, arg2_2: 0xbeef00000123, want: false},
+		{name: "same 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 123, arg1_2: 321, want: false},
+		{name: "diff 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 789, arg1_2: 987, want: true},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			c := newArgsTracker(tc.idx...)
+			regs := newRegs()
+			setArgVal(0, tc.arg1_1, regs)
+			setArgVal(1, tc.arg2_1, regs)
+			if !c.shouldReport(regs) {
+				t.Error("first call to shouldReport, got: false, want: true")
+			}
+			c.onReported(regs)
+
+			setArgVal(0, tc.arg1_2, regs)
+			setArgVal(1, tc.arg2_2, regs)
+			if got := c.shouldReport(regs); tc.want != got {
+				t.Errorf("second call to shouldReport, got: %t, want: %t", got, tc.want)
+			}
+		})
+	}
+}
+
+func TestArgsTrackerLimit(t *testing.T) {
+	c := newArgsTracker(0, 1)
+	for i := 0; i < reportLimit; i++ {
+		regs := newRegs()
+		setArgVal(0, 123, regs)
+		setArgVal(1, uint64(i), regs)
+		if !c.shouldReport(regs) {
+			t.Error("shouldReport before limit was reached, got: false, want: true")
+		}
+		c.onReported(regs)
+	}
+
+	// Should hit the count limit now.
+	regs := newRegs()
+	setArgVal(0, 123, regs)
+	setArgVal(1, 123456, regs)
+	if c.shouldReport(regs) {
+		t.Error("shouldReport after limit was reached, got: true, want: false")
+	}
+}
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
new file mode 100644
index 000000000..bb01b8fb5
--- /dev/null
+++ b/runsc/boot/config.go
@@ -0,0 +1,329 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/watchdog"
+)
+
+// FileAccessType tells how the filesystem is accessed.
+type FileAccessType int
+
+const (
+	// FileAccessShared sends IO requests to a Gofer process that validates the
+	// requests and forwards them to the host.
+	FileAccessShared FileAccessType = iota
+
+	// FileAccessExclusive is the same as FileAccessShared, but enables
+	// extra caching for improved performance. It should only be used if
+	// the sandbox has exclusive access to the filesystem.
+	FileAccessExclusive
+)
+
+// MakeFileAccessType converts type from string.
+func MakeFileAccessType(s string) (FileAccessType, error) {
+	switch s {
+	case "shared":
+		return FileAccessShared, nil
+	case "exclusive":
+		return FileAccessExclusive, nil
+	default:
+		return 0, fmt.Errorf("invalid file access type %q", s)
+	}
+}
+
+func (f FileAccessType) String() string {
+	switch f {
+	case FileAccessShared:
+		return "shared"
+	case FileAccessExclusive:
+		return "exclusive"
+	default:
+		return fmt.Sprintf("unknown(%d)", f)
+	}
+}
+
+// NetworkType tells which network stack to use.
+type NetworkType int
+
+const (
+	// NetworkSandbox uses internal network stack, isolated from the host.
+	NetworkSandbox NetworkType = iota
+
+	// NetworkHost redirects network related syscalls to the host network.
+	NetworkHost
+
+	// NetworkNone sets up just loopback using netstack.
+	NetworkNone
+)
+
+// MakeNetworkType converts type from string.
+func MakeNetworkType(s string) (NetworkType, error) {
+	switch s {
+	case "sandbox":
+		return NetworkSandbox, nil
+	case "host":
+		return NetworkHost, nil
+	case "none":
+		return NetworkNone, nil
+	default:
+		return 0, fmt.Errorf("invalid network type %q", s)
+	}
+}
+
+func (n NetworkType) String() string {
+	switch n {
+	case NetworkSandbox:
+		return "sandbox"
+	case NetworkHost:
+		return "host"
+	case NetworkNone:
+		return "none"
+	default:
+		return fmt.Sprintf("unknown(%d)", n)
+	}
+}
+
+// MakeWatchdogAction converts type from string.
+func MakeWatchdogAction(s string) (watchdog.Action, error) {
+	switch strings.ToLower(s) {
+	case "log", "logwarning":
+		return watchdog.LogWarning, nil
+	case "panic":
+		return watchdog.Panic, nil
+	default:
+		return 0, fmt.Errorf("invalid watchdog action %q", s)
+	}
+}
+
+// MakeRefsLeakMode converts type from string.
+func MakeRefsLeakMode(s string) (refs.LeakMode, error) {
+	switch strings.ToLower(s) {
+	case "disabled":
+		return refs.NoLeakChecking, nil
+	case "log-names":
+		return refs.LeaksLogWarning, nil
+	case "log-traces":
+		return refs.LeaksLogTraces, nil
+	default:
+		return 0, fmt.Errorf("invalid refs leakmode %q", s)
+	}
+}
+
+func refsLeakModeToString(mode refs.LeakMode) string {
+	switch mode {
+	// If not set, default it to disabled.
+	case refs.UninitializedLeakChecking, refs.NoLeakChecking:
+		return "disabled"
+	case refs.LeaksLogWarning:
+		return "log-names"
+	case refs.LeaksLogTraces:
+		return "log-traces"
+	default:
+		panic(fmt.Sprintf("Invalid leakmode: %d", mode))
+	}
+}
+
+// Config holds configuration that is not part of the runtime spec.
+type Config struct {
+	// RootDir is the runtime root directory.
+	RootDir string
+
+	// Debug indicates that debug logging should be enabled.
+	Debug bool
+
+	// LogFilename is the filename to log to, if not empty.
+	LogFilename string
+
+	// LogFormat is the log format.
+	LogFormat string
+
+	// DebugLog is the path to log debug information to, if not empty.
+	DebugLog string
+
+	// PanicLog is the path to log GO's runtime messages, if not empty.
+	PanicLog string
+
+	// DebugLogFormat is the log format for debug.
+	DebugLogFormat string
+
+	// FileAccess indicates how the filesystem is accessed.
+	FileAccess FileAccessType
+
+	// Overlay is whether to wrap the root filesystem in an overlay.
+	Overlay bool
+
+	// FSGoferHostUDS enables the gofer to mount a host UDS.
+	FSGoferHostUDS bool
+
+	// Network indicates what type of network to use.
+	Network NetworkType
+
+	// EnableRaw indicates whether raw sockets should be enabled. Raw
+	// sockets are disabled by stripping CAP_NET_RAW from the list of
+	// capabilities.
+	EnableRaw bool
+
+	// HardwareGSO indicates that hardware segmentation offload is enabled.
+	HardwareGSO bool
+
+	// SoftwareGSO indicates that software segmentation offload is enabled.
+	SoftwareGSO bool
+
+	// TXChecksumOffload indicates that TX Checksum Offload is enabled.
+	TXChecksumOffload bool
+
+	// RXChecksumOffload indicates that RX Checksum Offload is enabled.
+	RXChecksumOffload bool
+
+	// QDisc indicates the type of queuening discipline to use by default
+	// for non-loopback interfaces.
+	QDisc QueueingDiscipline
+
+	// LogPackets indicates that all network packets should be logged.
+	LogPackets bool
+
+	// Platform is the platform to run on.
+	Platform string
+
+	// Strace indicates that strace should be enabled.
+	Strace bool
+
+	// StraceSyscalls is the set of syscalls to trace.  If StraceEnable is
+	// true and this list is empty, then all syscalls will be traced.
+	StraceSyscalls []string
+
+	// StraceLogSize is the max size of data blobs to display.
+	StraceLogSize uint
+
+	// DisableSeccomp indicates whether seccomp syscall filters should be
+	// disabled. Pardon the double negation, but default to enabled is important.
+	DisableSeccomp bool
+
+	// WatchdogAction sets what action the watchdog takes when triggered.
+	WatchdogAction watchdog.Action
+
+	// PanicSignal registers signal handling that panics. Usually set to
+	// SIGUSR2(12) to troubleshoot hangs. -1 disables it.
+	PanicSignal int
+
+	// ProfileEnable is set to prepare the sandbox to be profiled.
+	ProfileEnable bool
+
+	// RestoreFile is the path to the saved container image
+	RestoreFile string
+
+	// NumNetworkChannels controls the number of AF_PACKET sockets that map
+	// to the same underlying network device. This allows netstack to better
+	// scale for high throughput use cases.
+	NumNetworkChannels int
+
+	// Rootless allows the sandbox to be started with a user that is not root.
+	// Defense is depth measures are weaker with rootless. Specifically, the
+	// sandbox and Gofer process run as root inside a user namespace with root
+	// mapped to the caller's user.
+	Rootless bool
+
+	// AlsoLogToStderr allows to send log messages to stderr.
+	AlsoLogToStderr bool
+
+	// ReferenceLeakMode sets reference leak check mode
+	ReferenceLeakMode refs.LeakMode
+
+	// OverlayfsStaleRead instructs the sandbox to assume that the root mount
+	// is on a Linux overlayfs mount, which does not necessarily preserve
+	// coherence between read-only and subsequent writable file descriptors
+	// representing the "same" file.
+	OverlayfsStaleRead bool
+
+	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
+	// tests. It allows runsc to start the sandbox process as the current
+	// user, and without chrooting the sandbox process. This can be
+	// necessary in test environments that have limited capabilities.
+	TestOnlyAllowRunAsCurrentUserWithoutChroot bool
+
+	// TestOnlyTestNameEnv should only be used in tests. It looks up for the
+	// test name in the container environment variables and adds it to the debug
+	// log file name. This is done to help identify the log with the test when
+	// multiple tests are run in parallel, since there is no way to pass
+	// parameters to the runtime from docker.
+	TestOnlyTestNameEnv string
+
+	// CPUNumFromQuota sets CPU number count to available CPU quota, using
+	// least integer value greater than or equal to quota.
+	//
+	// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
+	CPUNumFromQuota bool
+
+	// Enables VFS2 (not plumbled through yet).
+	VFS2 bool
+}
+
+// ToFlags returns a slice of flags that correspond to the given Config.
+func (c *Config) ToFlags() []string {
+	f := []string{
+		"--root=" + c.RootDir,
+		"--debug=" + strconv.FormatBool(c.Debug),
+		"--log=" + c.LogFilename,
+		"--log-format=" + c.LogFormat,
+		"--debug-log=" + c.DebugLog,
+		"--panic-log=" + c.PanicLog,
+		"--debug-log-format=" + c.DebugLogFormat,
+		"--file-access=" + c.FileAccess.String(),
+		"--overlay=" + strconv.FormatBool(c.Overlay),
+		"--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS),
+		"--network=" + c.Network.String(),
+		"--log-packets=" + strconv.FormatBool(c.LogPackets),
+		"--platform=" + c.Platform,
+		"--strace=" + strconv.FormatBool(c.Strace),
+		"--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","),
+		"--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)),
+		"--watchdog-action=" + c.WatchdogAction.String(),
+		"--panic-signal=" + strconv.Itoa(c.PanicSignal),
+		"--profile=" + strconv.FormatBool(c.ProfileEnable),
+		"--net-raw=" + strconv.FormatBool(c.EnableRaw),
+		"--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
+		"--rootless=" + strconv.FormatBool(c.Rootless),
+		"--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr),
+		"--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode),
+		"--gso=" + strconv.FormatBool(c.HardwareGSO),
+		"--software-gso=" + strconv.FormatBool(c.SoftwareGSO),
+		"--rx-checksum-offload=" + strconv.FormatBool(c.RXChecksumOffload),
+		"--tx-checksum-offload=" + strconv.FormatBool(c.TXChecksumOffload),
+		"--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
+		"--qdisc=" + c.QDisc.String(),
+	}
+	if c.CPUNumFromQuota {
+		f = append(f, "--cpu-num-from-quota")
+	}
+	// Only include these if set since it is never to be used by users.
+	if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
+		f = append(f, "--TESTONLY-unsafe-nonroot=true")
+	}
+	if len(c.TestOnlyTestNameEnv) != 0 {
+		f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv)
+	}
+
+	if c.VFS2 {
+		f = append(f, "--vfs2=true")
+	}
+
+	return f
+}
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
new file mode 100644
index 000000000..8125d5061
--- /dev/null
+++ b/runsc/boot/controller.go
@@ -0,0 +1,506 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"syscall"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/control/server"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
+	"gvisor.dev/gvisor/pkg/sentry/state"
+	"gvisor.dev/gvisor/pkg/sentry/time"
+	"gvisor.dev/gvisor/pkg/sentry/watchdog"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/urpc"
+	"gvisor.dev/gvisor/runsc/boot/pprof"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+const (
+	// ContainerCheckpoint checkpoints a container.
+	ContainerCheckpoint = "containerManager.Checkpoint"
+
+	// ContainerCreate creates a container.
+	ContainerCreate = "containerManager.Create"
+
+	// ContainerDestroy is used to stop a non-root container and free all
+	// associated resources in the sandbox.
+	ContainerDestroy = "containerManager.Destroy"
+
+	// ContainerEvent is the URPC endpoint for getting stats about the
+	// container used by "runsc events".
+	ContainerEvent = "containerManager.Event"
+
+	// ContainerExecuteAsync is the URPC endpoint for executing a command in a
+	// container.
+	ContainerExecuteAsync = "containerManager.ExecuteAsync"
+
+	// ContainerPause pauses the container.
+	ContainerPause = "containerManager.Pause"
+
+	// ContainerProcesses is the URPC endpoint for getting the list of
+	// processes running in a container.
+	ContainerProcesses = "containerManager.Processes"
+
+	// ContainerRestore restores a container from a statefile.
+	ContainerRestore = "containerManager.Restore"
+
+	// ContainerResume unpauses the paused container.
+	ContainerResume = "containerManager.Resume"
+
+	// ContainerSignal is used to send a signal to a container.
+	ContainerSignal = "containerManager.Signal"
+
+	// ContainerSignalProcess is used to send a signal to a particular
+	// process in a container.
+	ContainerSignalProcess = "containerManager.SignalProcess"
+
+	// ContainerStart is the URPC endpoint for running a non-root container
+	// within a sandbox.
+	ContainerStart = "containerManager.Start"
+
+	// ContainerWait is used to wait on the init process of the container
+	// and return its ExitStatus.
+	ContainerWait = "containerManager.Wait"
+
+	// ContainerWaitPID is used to wait on a process with a certain PID in
+	// the sandbox and return its ExitStatus.
+	ContainerWaitPID = "containerManager.WaitPID"
+
+	// NetworkCreateLinksAndRoutes is the URPC endpoint for creating links
+	// and routes in a network stack.
+	NetworkCreateLinksAndRoutes = "Network.CreateLinksAndRoutes"
+
+	// RootContainerStart is the URPC endpoint for starting a new sandbox
+	// with root container.
+	RootContainerStart = "containerManager.StartRoot"
+
+	// SandboxStacks collects sandbox stacks for debugging.
+	SandboxStacks = "debug.Stacks"
+)
+
+// Profiling related commands (see pprof.go for more details).
+const (
+	StartCPUProfile  = "Profile.StartCPUProfile"
+	StopCPUProfile   = "Profile.StopCPUProfile"
+	HeapProfile      = "Profile.HeapProfile"
+	GoroutineProfile = "Profile.GoroutineProfile"
+	BlockProfile     = "Profile.BlockProfile"
+	MutexProfile     = "Profile.MutexProfile"
+	StartTrace       = "Profile.StartTrace"
+	StopTrace        = "Profile.StopTrace"
+)
+
+// Logging related commands (see logging.go for more details).
+const (
+	ChangeLogging = "Logging.Change"
+)
+
+// ControlSocketAddr generates an abstract unix socket name for the given ID.
+func ControlSocketAddr(id string) string {
+	return fmt.Sprintf("\x00runsc-sandbox.%s", id)
+}
+
+// controller holds the control server, and is used for communication into the
+// sandbox.
+type controller struct {
+	// srv is the control server.
+	srv *server.Server
+
+	// manager holds the containerManager methods.
+	manager *containerManager
+}
+
+// newController creates a new controller. The caller must call
+// controller.srv.StartServing() to start the controller.
+func newController(fd int, l *Loader) (*controller, error) {
+	srv, err := server.CreateFromFD(fd)
+	if err != nil {
+		return nil, err
+	}
+
+	manager := &containerManager{
+		startChan:       make(chan struct{}),
+		startResultChan: make(chan error),
+		l:               l,
+	}
+	srv.Register(manager)
+
+	if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok {
+		net := &Network{
+			Stack: eps.Stack,
+		}
+		srv.Register(net)
+	}
+
+	srv.Register(&debug{})
+	srv.Register(&control.Logging{})
+	if l.conf.ProfileEnable {
+		srv.Register(&control.Profile{
+			Kernel: l.k,
+		})
+	}
+
+	return &controller{
+		srv:     srv,
+		manager: manager,
+	}, nil
+}
+
+// containerManager manages sandbox containers.
+type containerManager struct {
+	// startChan is used to signal when the root container process should
+	// be started.
+	startChan chan struct{}
+
+	// startResultChan is used to signal when the root container  has
+	// started. Any errors encountered during startup will be sent to the
+	// channel. A nil value indicates success.
+	startResultChan chan error
+
+	// l is the loader that creates containers and sandboxes.
+	l *Loader
+}
+
+// StartRoot will start the root container process.
+func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error {
+	log.Debugf("containerManager.StartRoot %q", *cid)
+	// Tell the root container to start and wait for the result.
+	cm.startChan <- struct{}{}
+	if err := <-cm.startResultChan; err != nil {
+		return fmt.Errorf("starting sandbox: %v", err)
+	}
+	return nil
+}
+
+// Processes retrieves information about processes running in the sandbox.
+func (cm *containerManager) Processes(cid *string, out *[]*control.Process) error {
+	log.Debugf("containerManager.Processes: %q", *cid)
+	return control.Processes(cm.l.k, *cid, out)
+}
+
+// Create creates a container within a sandbox.
+func (cm *containerManager) Create(cid *string, _ *struct{}) error {
+	log.Debugf("containerManager.Create: %q", *cid)
+	return cm.l.createContainer(*cid)
+}
+
+// StartArgs contains arguments to the Start method.
+type StartArgs struct {
+	// Spec is the spec of the container to start.
+	Spec *specs.Spec
+
+	// Config is the runsc-specific configuration for the sandbox.
+	Conf *Config
+
+	// CID is the ID of the container to start.
+	CID string
+
+	// FilePayload contains, in order:
+	//   * stdin, stdout, and stderr.
+	//   * the file descriptor over which the sandbox will
+	//     request files from its root filesystem.
+	urpc.FilePayload
+}
+
+// Start runs a created container within a sandbox.
+func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
+	log.Debugf("containerManager.Start: %+v", args)
+
+	// Validate arguments.
+	if args == nil {
+		return errors.New("start missing arguments")
+	}
+	if args.Spec == nil {
+		return errors.New("start arguments missing spec")
+	}
+	if args.Conf == nil {
+		return errors.New("start arguments missing config")
+	}
+	if args.CID == "" {
+		return errors.New("start argument missing container ID")
+	}
+	if len(args.FilePayload.Files) < 4 {
+		return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer")
+	}
+
+	// All validation passed, logs the spec for debugging.
+	specutils.LogSpec(args.Spec)
+
+	err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files)
+	if err != nil {
+		log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err)
+		return err
+	}
+	log.Debugf("Container %q started", args.CID)
+
+	return nil
+}
+
+// Destroy stops a container if it is still running and cleans up its
+// filesystem.
+func (cm *containerManager) Destroy(cid *string, _ *struct{}) error {
+	log.Debugf("containerManager.destroy %q", *cid)
+	return cm.l.destroyContainer(*cid)
+}
+
+// ExecuteAsync starts running a command on a created or running sandbox. It
+// returns the PID of the new process.
+func (cm *containerManager) ExecuteAsync(args *control.ExecArgs, pid *int32) error {
+	log.Debugf("containerManager.ExecuteAsync: %+v", args)
+	tgid, err := cm.l.executeAsync(args)
+	if err != nil {
+		log.Debugf("containerManager.ExecuteAsync failed: %+v: %v", args, err)
+		return err
+	}
+	*pid = int32(tgid)
+	return nil
+}
+
+// Checkpoint pauses a sandbox and saves its state.
+func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error {
+	log.Debugf("containerManager.Checkpoint")
+	state := control.State{
+		Kernel:   cm.l.k,
+		Watchdog: cm.l.watchdog,
+	}
+	return state.Save(o, nil)
+}
+
+// Pause suspends a container.
+func (cm *containerManager) Pause(_, _ *struct{}) error {
+	log.Debugf("containerManager.Pause")
+	cm.l.k.Pause()
+	return nil
+}
+
+// RestoreOpts contains options related to restoring a container's file system.
+type RestoreOpts struct {
+	// FilePayload contains the state file to be restored, followed by the
+	// platform device file if necessary.
+	urpc.FilePayload
+
+	// SandboxID contains the ID of the sandbox.
+	SandboxID string
+}
+
+// Restore loads a container from a statefile.
+// The container's current kernel is destroyed, a restore environment is
+// created, and the kernel is recreated with the restore state file. The
+// container then sends the signal to start.
+func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
+	log.Debugf("containerManager.Restore")
+
+	var specFile, deviceFile *os.File
+	switch numFiles := len(o.FilePayload.Files); numFiles {
+	case 2:
+		// The device file is donated to the platform.
+		// Can't take ownership away from os.File. dup them to get a new FD.
+		fd, err := syscall.Dup(int(o.FilePayload.Files[1].Fd()))
+		if err != nil {
+			return fmt.Errorf("failed to dup file: %v", err)
+		}
+		deviceFile = os.NewFile(uintptr(fd), "platform device")
+		fallthrough
+	case 1:
+		specFile = o.FilePayload.Files[0]
+	case 0:
+		return fmt.Errorf("at least one file must be passed to Restore")
+	default:
+		return fmt.Errorf("at most two files may be passed to Restore")
+	}
+
+	// Pause the kernel while we build a new one.
+	cm.l.k.Pause()
+
+	p, err := createPlatform(cm.l.conf, deviceFile)
+	if err != nil {
+		return fmt.Errorf("creating platform: %v", err)
+	}
+	k := &kernel.Kernel{
+		Platform: p,
+	}
+	mf, err := createMemoryFile()
+	if err != nil {
+		return fmt.Errorf("creating memory file: %v", err)
+	}
+	k.SetMemoryFile(mf)
+	networkStack := cm.l.k.RootNetworkNamespace().Stack()
+	cm.l.k = k
+
+	// Set up the restore environment.
+	mntr := newContainerMounter(cm.l.spec, cm.l.goferFDs, cm.l.k, cm.l.mountHints)
+	renv, err := mntr.createRestoreEnvironment(cm.l.conf)
+	if err != nil {
+		return fmt.Errorf("creating RestoreEnvironment: %v", err)
+	}
+	fs.SetRestoreEnvironment(*renv)
+
+	// Prepare to load from the state file.
+	if eps, ok := networkStack.(*netstack.Stack); ok {
+		stack.StackFromEnv = eps.Stack // FIXME(b/36201077)
+	}
+	info, err := specFile.Stat()
+	if err != nil {
+		return err
+	}
+	if info.Size() == 0 {
+		return fmt.Errorf("file cannot be empty")
+	}
+
+	if cm.l.conf.ProfileEnable {
+		// pprof.Initialize opens /proc/self/maps, so has to be called before
+		// installing seccomp filters.
+		pprof.Initialize()
+	}
+
+	// Seccomp filters have to be applied before parsing the state file.
+	if err := cm.l.installSeccompFilters(); err != nil {
+		return err
+	}
+
+	// Load the state.
+	loadOpts := state.LoadOpts{Source: specFile}
+	if err := loadOpts.Load(k, networkStack, time.NewCalibratedClocks()); err != nil {
+		return err
+	}
+
+	// Since we have a new kernel we also must make a new watchdog.
+	dogOpts := watchdog.DefaultOpts
+	dogOpts.TaskTimeoutAction = cm.l.conf.WatchdogAction
+	dog := watchdog.New(k, dogOpts)
+
+	// Change the loader fields to reflect the changes made when restoring.
+	cm.l.k = k
+	cm.l.watchdog = dog
+	cm.l.rootProcArgs = kernel.CreateProcessArgs{}
+	cm.l.restore = true
+
+	// Reinitialize the sandbox ID and processes map. Note that it doesn't
+	// restore the state of multiple containers, nor exec processes.
+	cm.l.sandboxID = o.SandboxID
+	cm.l.mu.Lock()
+	eid := execID{cid: o.SandboxID}
+	cm.l.processes = map[execID]*execProcess{
+		eid: {
+			tg: cm.l.k.GlobalInit(),
+		},
+	}
+	cm.l.mu.Unlock()
+
+	// Tell the root container to start and wait for the result.
+	cm.startChan <- struct{}{}
+	if err := <-cm.startResultChan; err != nil {
+		return fmt.Errorf("starting sandbox: %v", err)
+	}
+
+	return nil
+}
+
+// Resume unpauses a container.
+func (cm *containerManager) Resume(_, _ *struct{}) error {
+	log.Debugf("containerManager.Resume")
+	cm.l.k.Unpause()
+	return nil
+}
+
+// Wait waits for the init process in the given container.
+func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error {
+	log.Debugf("containerManager.Wait")
+	err := cm.l.waitContainer(*cid, waitStatus)
+	log.Debugf("containerManager.Wait returned, waitStatus: %v: %v", waitStatus, err)
+	return err
+}
+
+// WaitPIDArgs are arguments to the WaitPID method.
+type WaitPIDArgs struct {
+	// PID is the PID in the container's PID namespace.
+	PID int32
+
+	// CID is the container ID.
+	CID string
+}
+
+// WaitPID waits for the process with PID 'pid' in the sandbox.
+func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error {
+	log.Debugf("containerManager.Wait")
+	return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus)
+}
+
+// SignalDeliveryMode enumerates different signal delivery modes.
+type SignalDeliveryMode int
+
+const (
+	// DeliverToProcess delivers the signal to the container process with
+	// the specified PID. If PID is 0, then the container init process is
+	// signaled.
+	DeliverToProcess SignalDeliveryMode = iota
+
+	// DeliverToAllProcesses delivers the signal to all processes in the
+	// container. PID must be 0.
+	DeliverToAllProcesses
+
+	// DeliverToForegroundProcessGroup delivers the signal to the
+	// foreground process group in the same TTY session as the specified
+	// process. If PID is 0, then the signal is delivered to the foreground
+	// process group for the TTY for the init process.
+	DeliverToForegroundProcessGroup
+)
+
+func (s SignalDeliveryMode) String() string {
+	switch s {
+	case DeliverToProcess:
+		return "Process"
+	case DeliverToAllProcesses:
+		return "All"
+	case DeliverToForegroundProcessGroup:
+		return "Foreground Process Group"
+	}
+	return fmt.Sprintf("unknown signal delivery mode: %d", s)
+}
+
+// SignalArgs are arguments to the Signal method.
+type SignalArgs struct {
+	// CID is the container ID.
+	CID string
+
+	// Signo is the signal to send to the process.
+	Signo int32
+
+	// PID is the process ID in the given container that will be signaled.
+	// If 0, the root container will be signalled.
+	PID int32
+
+	// Mode is the signal delivery mode.
+	Mode SignalDeliveryMode
+}
+
+// Signal sends a signal to one or more processes in a container. If args.PID
+// is 0, then the container init process is used. Depending on the
+// args.SignalDeliveryMode option, the signal may be sent directly to the
+// indicated process, to all processes in the container, or to the foreground
+// process group.
+func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error {
+	log.Debugf("containerManager.Signal %+v", args)
+	return cm.l.signal(args.CID, args.PID, args.Signo, args.Mode)
+}
diff --git a/runsc/boot/debug.go b/runsc/boot/debug.go
new file mode 100644
index 000000000..1fb32c527
--- /dev/null
+++ b/runsc/boot/debug.go
@@ -0,0 +1,29 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"gvisor.dev/gvisor/pkg/log"
+)
+
+type debug struct {
+}
+
+// Stacks collects all sandbox stacks and copies them to 'stacks'.
+func (*debug) Stacks(_ *struct{}, stacks *string) error {
+	buf := log.Stacks(true)
+	*stacks = string(buf)
+	return nil
+}
diff --git a/runsc/boot/events.go b/runsc/boot/events.go
new file mode 100644
index 000000000..422f4da00
--- /dev/null
+++ b/runsc/boot/events.go
@@ -0,0 +1,81 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+)
+
+// Event struct for encoding the event data to JSON. Corresponds to runc's
+// main.event struct.
+type Event struct {
+	Type string      `json:"type"`
+	ID   string      `json:"id"`
+	Data interface{} `json:"data,omitempty"`
+}
+
+// Stats is the runc specific stats structure for stability when encoding and
+// decoding stats.
+type Stats struct {
+	Memory Memory `json:"memory"`
+	Pids   Pids   `json:"pids"`
+}
+
+// Pids contains stats on processes.
+type Pids struct {
+	Current uint64 `json:"current,omitempty"`
+	Limit   uint64 `json:"limit,omitempty"`
+}
+
+// MemoryEntry contains stats on a kind of memory.
+type MemoryEntry struct {
+	Limit   uint64 `json:"limit"`
+	Usage   uint64 `json:"usage,omitempty"`
+	Max     uint64 `json:"max,omitempty"`
+	Failcnt uint64 `json:"failcnt"`
+}
+
+// Memory contains stats on memory.
+type Memory struct {
+	Cache     uint64            `json:"cache,omitempty"`
+	Usage     MemoryEntry       `json:"usage,omitempty"`
+	Swap      MemoryEntry       `json:"swap,omitempty"`
+	Kernel    MemoryEntry       `json:"kernel,omitempty"`
+	KernelTCP MemoryEntry       `json:"kernelTCP,omitempty"`
+	Raw       map[string]uint64 `json:"raw,omitempty"`
+}
+
+// Event gets the events from the container.
+func (cm *containerManager) Event(_ *struct{}, out *Event) error {
+	stats := &Stats{}
+	stats.populateMemory(cm.l.k)
+	stats.populatePIDs(cm.l.k)
+	*out = Event{Type: "stats", Data: stats}
+	return nil
+}
+
+func (s *Stats) populateMemory(k *kernel.Kernel) {
+	mem := k.MemoryFile()
+	mem.UpdateUsage()
+	_, totalUsage := usage.MemoryAccounting.Copy()
+	s.Memory.Usage = MemoryEntry{
+		Usage: totalUsage,
+	}
+}
+
+func (s *Stats) populatePIDs(k *kernel.Kernel) {
+	s.Pids.Current = uint64(len(k.TaskSet().Root.ThreadGroups()))
+}
diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD
new file mode 100644
index 000000000..ed18f0047
--- /dev/null
+++ b/runsc/boot/filter/BUILD
@@ -0,0 +1,28 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "filter",
+    srcs = [
+        "config.go",
+        "config_amd64.go",
+        "config_arm64.go",
+        "config_profile.go",
+        "extra_filters.go",
+        "extra_filters_msan.go",
+        "extra_filters_race.go",
+        "filter.go",
+    ],
+    visibility = [
+        "//runsc/boot:__subpackages__",
+    ],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/log",
+        "//pkg/seccomp",
+        "//pkg/sentry/platform",
+        "//pkg/tcpip/link/fdbased",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
new file mode 100644
index 000000000..60e33425f
--- /dev/null
+++ b/runsc/boot/filter/config.go
@@ -0,0 +1,559 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package filter
+
+import (
+	"os"
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/seccomp"
+	"gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
+)
+
+// allowedSyscalls is the set of syscalls executed by the Sentry to the host OS.
+var allowedSyscalls = seccomp.SyscallRules{
+	syscall.SYS_CLOCK_GETTIME: {},
+	syscall.SYS_CLONE: []seccomp.Rule{
+		{
+			seccomp.AllowValue(
+				syscall.CLONE_VM |
+					syscall.CLONE_FS |
+					syscall.CLONE_FILES |
+					syscall.CLONE_SIGHAND |
+					syscall.CLONE_SYSVSEM |
+					syscall.CLONE_THREAD),
+		},
+	},
+	syscall.SYS_CLOSE: {},
+	syscall.SYS_DUP:   {},
+	syscall.SYS_DUP3: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.O_CLOEXEC),
+		},
+	},
+	syscall.SYS_EPOLL_CREATE1: {},
+	syscall.SYS_EPOLL_CTL:     {},
+	syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(0),
+		},
+	},
+	syscall.SYS_EVENTFD2: []seccomp.Rule{
+		{
+			seccomp.AllowValue(0),
+			seccomp.AllowValue(0),
+		},
+	},
+	syscall.SYS_EXIT:       {},
+	syscall.SYS_EXIT_GROUP: {},
+	syscall.SYS_FALLOCATE:  {},
+	syscall.SYS_FCHMOD:     {},
+	syscall.SYS_FCNTL: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.F_GETFL),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.F_SETFL),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.F_GETFD),
+		},
+	},
+	syscall.SYS_FSTAT:     {},
+	syscall.SYS_FSYNC:     {},
+	syscall.SYS_FTRUNCATE: {},
+	syscall.SYS_FUTEX: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
+			seccomp.AllowAny{},
+		},
+		// Non-private variants are included for flipcall support. They are otherwise
+		// unncessary, as the sentry will use only private futexes internally.
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(linux.FUTEX_WAIT),
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(linux.FUTEX_WAKE),
+			seccomp.AllowAny{},
+		},
+	},
+	syscall.SYS_GETPID: {},
+	unix.SYS_GETRANDOM: {},
+	syscall.SYS_GETSOCKOPT: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.SOL_SOCKET),
+			seccomp.AllowValue(syscall.SO_DOMAIN),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.SOL_SOCKET),
+			seccomp.AllowValue(syscall.SO_TYPE),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.SOL_SOCKET),
+			seccomp.AllowValue(syscall.SO_ERROR),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.SOL_SOCKET),
+			seccomp.AllowValue(syscall.SO_SNDBUF),
+		},
+	},
+	syscall.SYS_GETTID:       {},
+	syscall.SYS_GETTIMEOFDAY: {},
+	// SYS_IOCTL is needed for terminal support, but we only allow
+	// setting/getting termios and winsize.
+	syscall.SYS_IOCTL: []seccomp.Rule{
+		{
+			seccomp.AllowAny{}, /* fd */
+			seccomp.AllowValue(linux.TCGETS),
+			seccomp.AllowAny{}, /* termios struct */
+		},
+		{
+			seccomp.AllowAny{}, /* fd */
+			seccomp.AllowValue(linux.TCSETS),
+			seccomp.AllowAny{}, /* termios struct */
+		},
+		{
+			seccomp.AllowAny{}, /* fd */
+			seccomp.AllowValue(linux.TCSETSF),
+			seccomp.AllowAny{}, /* termios struct */
+		},
+		{
+			seccomp.AllowAny{}, /* fd */
+			seccomp.AllowValue(linux.TCSETSW),
+			seccomp.AllowAny{}, /* termios struct */
+		},
+		{
+			seccomp.AllowAny{}, /* fd */
+			seccomp.AllowValue(linux.TIOCSWINSZ),
+			seccomp.AllowAny{}, /* winsize struct */
+		},
+		{
+			seccomp.AllowAny{}, /* fd */
+			seccomp.AllowValue(linux.TIOCGWINSZ),
+			seccomp.AllowAny{}, /* winsize struct */
+		},
+	},
+	syscall.SYS_LSEEK:   {},
+	syscall.SYS_MADVISE: {},
+	syscall.SYS_MINCORE: {},
+	// Used by the Go runtime as a temporarily workaround for a Linux
+	// 5.2-5.4 bug.
+	//
+	// See src/runtime/os_linux_x86.go.
+	//
+	// TODO(b/148688965): Remove once this is gone from Go.
+	syscall.SYS_MLOCK: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(4096),
+		},
+	},
+	syscall.SYS_MMAP: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MAP_SHARED),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MAP_PRIVATE),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.PROT_WRITE | syscall.PROT_READ),
+			seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
+		},
+	},
+	syscall.SYS_MPROTECT:  {},
+	syscall.SYS_MUNMAP:    {},
+	syscall.SYS_NANOSLEEP: {},
+	syscall.SYS_PPOLL:     {},
+	syscall.SYS_PREAD64:   {},
+	syscall.SYS_PREADV:    {},
+	unix.SYS_PREADV2:      {},
+	syscall.SYS_PWRITE64:  {},
+	syscall.SYS_PWRITEV:   {},
+	unix.SYS_PWRITEV2:     {},
+	syscall.SYS_READ:      {},
+	syscall.SYS_RECVMSG: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
+		},
+	},
+	syscall.SYS_RECVMMSG: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(fdbased.MaxMsgsPerRecv),
+			seccomp.AllowValue(syscall.MSG_DONTWAIT),
+			seccomp.AllowValue(0),
+		},
+	},
+	unix.SYS_SENDMMSG: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MSG_DONTWAIT),
+			seccomp.AllowValue(0),
+		},
+	},
+	syscall.SYS_RESTART_SYSCALL: {},
+	syscall.SYS_RT_SIGACTION:    {},
+	syscall.SYS_RT_SIGPROCMASK:  {},
+	syscall.SYS_RT_SIGRETURN:    {},
+	syscall.SYS_SCHED_YIELD:     {},
+	syscall.SYS_SENDMSG: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
+		},
+	},
+	syscall.SYS_SETITIMER: {},
+	syscall.SYS_SHUTDOWN: []seccomp.Rule{
+		// Used by fs/host to shutdown host sockets.
+		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)},
+		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)},
+		// Used by unet to shutdown connections.
+		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
+	},
+	syscall.SYS_SIGALTSTACK:     {},
+	unix.SYS_STATX:              {},
+	syscall.SYS_SYNC_FILE_RANGE: {},
+	syscall.SYS_TEE: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(1),                      /* len */
+			seccomp.AllowValue(unix.SPLICE_F_NONBLOCK), /* flags */
+		},
+	},
+	syscall.SYS_TGKILL: []seccomp.Rule{
+		{
+			seccomp.AllowValue(uint64(os.Getpid())),
+		},
+	},
+	syscall.SYS_UTIMENSAT: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(0), /* null pathname */
+			seccomp.AllowAny{},
+			seccomp.AllowValue(0), /* flags */
+		},
+	},
+	syscall.SYS_WRITE: {},
+	// The only user in rawfile.NonBlockingWrite3 always passes iovcnt with
+	// values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR
+	// option is enabled for a packet socket.
+	syscall.SYS_WRITEV: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(2),
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(3),
+		},
+	},
+}
+
+// hostInetFilters contains syscalls that are needed by sentry/socket/hostinet.
+func hostInetFilters() seccomp.SyscallRules {
+	return seccomp.SyscallRules{
+		syscall.SYS_ACCEPT4: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowAny{},
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+			},
+		},
+		syscall.SYS_BIND:        {},
+		syscall.SYS_CONNECT:     {},
+		syscall.SYS_GETPEERNAME: {},
+		syscall.SYS_GETSOCKNAME: {},
+		syscall.SYS_GETSOCKOPT: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IP),
+				seccomp.AllowValue(syscall.IP_TOS),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IP),
+				seccomp.AllowValue(syscall.IP_RECVTOS),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_TCLASS),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_V6ONLY),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_ERROR),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_KEEPALIVE),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_SNDBUF),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_RCVBUF),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_REUSEADDR),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_TYPE),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_LINGER),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_TCP),
+				seccomp.AllowValue(syscall.TCP_NODELAY),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_TCP),
+				seccomp.AllowValue(syscall.TCP_INFO),
+			},
+		},
+		syscall.SYS_IOCTL: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.TIOCOUTQ),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.TIOCINQ),
+			},
+		},
+		syscall.SYS_LISTEN:   {},
+		syscall.SYS_READV:    {},
+		syscall.SYS_RECVFROM: {},
+		syscall.SYS_RECVMSG:  {},
+		syscall.SYS_SENDMSG:  {},
+		syscall.SYS_SENDTO:   {},
+		syscall.SYS_SETSOCKOPT: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_V6ONLY),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_SNDBUF),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_RCVBUF),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_REUSEADDR),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_TCP),
+				seccomp.AllowValue(syscall.TCP_NODELAY),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IP),
+				seccomp.AllowValue(syscall.IP_TOS),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IP),
+				seccomp.AllowValue(syscall.IP_RECVTOS),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_TCLASS),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+		},
+		syscall.SYS_SHUTDOWN: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SHUT_RD),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SHUT_WR),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SHUT_RDWR),
+			},
+		},
+		syscall.SYS_SOCKET: []seccomp.Rule{
+			{
+				seccomp.AllowValue(syscall.AF_INET),
+				seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.AllowValue(0),
+			},
+			{
+				seccomp.AllowValue(syscall.AF_INET),
+				seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.AllowValue(0),
+			},
+			{
+				seccomp.AllowValue(syscall.AF_INET6),
+				seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.AllowValue(0),
+			},
+			{
+				seccomp.AllowValue(syscall.AF_INET6),
+				seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+				seccomp.AllowValue(0),
+			},
+		},
+		syscall.SYS_WRITEV: {},
+	}
+}
+
+func controlServerFilters(fd int) seccomp.SyscallRules {
+	return seccomp.SyscallRules{
+		syscall.SYS_ACCEPT: []seccomp.Rule{
+			{
+				seccomp.AllowValue(fd),
+			},
+		},
+		syscall.SYS_LISTEN: []seccomp.Rule{
+			{
+				seccomp.AllowValue(fd),
+				seccomp.AllowValue(16 /* unet.backlog */),
+			},
+		},
+		syscall.SYS_GETSOCKOPT: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_SOCKET),
+				seccomp.AllowValue(syscall.SO_PEERCRED),
+			},
+		},
+	}
+}
diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go
new file mode 100644
index 000000000..5335ff82c
--- /dev/null
+++ b/runsc/boot/filter/config_amd64.go
@@ -0,0 +1,31 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+func init() {
+	allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL],
+		seccomp.Rule{seccomp.AllowValue(linux.ARCH_GET_FS)},
+		seccomp.Rule{seccomp.AllowValue(linux.ARCH_SET_FS)},
+	)
+}
diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go
new file mode 100644
index 000000000..7fa9bbda3
--- /dev/null
+++ b/runsc/boot/filter/config_arm64.go
@@ -0,0 +1,21 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package filter
+
+// Reserve for future customization.
+func init() {
+}
diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go
new file mode 100644
index 000000000..194952a7b
--- /dev/null
+++ b/runsc/boot/filter/config_profile.go
@@ -0,0 +1,34 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// profileFilters returns extra syscalls made by runtime/pprof package.
+func profileFilters() seccomp.SyscallRules {
+	return seccomp.SyscallRules{
+		syscall.SYS_OPENAT: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
+			},
+		},
+	}
+}
diff --git a/runsc/boot/filter/extra_filters.go b/runsc/boot/filter/extra_filters.go
new file mode 100644
index 000000000..e28d4b8d6
--- /dev/null
+++ b/runsc/boot/filter/extra_filters.go
@@ -0,0 +1,28 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !msan,!race
+
+package filter
+
+import (
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// instrumentationFilters returns additional filters for syscalls used by
+// Go instrumentation tools, e.g. -race, -msan.
+// Returns empty when disabled.
+func instrumentationFilters() seccomp.SyscallRules {
+	return nil
+}
diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go
new file mode 100644
index 000000000..209e646a7
--- /dev/null
+++ b/runsc/boot/filter/extra_filters_msan.go
@@ -0,0 +1,34 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build msan
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// instrumentationFilters returns additional filters for syscalls used by MSAN.
+func instrumentationFilters() seccomp.SyscallRules {
+	Report("MSAN is enabled: syscall filters less restrictive!")
+	return seccomp.SyscallRules{
+		syscall.SYS_CLONE:             {},
+		syscall.SYS_MMAP:              {},
+		syscall.SYS_SCHED_GETAFFINITY: {},
+		syscall.SYS_SET_ROBUST_LIST:   {},
+	}
+}
diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go
new file mode 100644
index 000000000..9ff80276a
--- /dev/null
+++ b/runsc/boot/filter/extra_filters_race.go
@@ -0,0 +1,41 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build race
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// instrumentationFilters returns additional filters for syscalls used by TSAN.
+func instrumentationFilters() seccomp.SyscallRules {
+	Report("TSAN is enabled: syscall filters less restrictive!")
+	return seccomp.SyscallRules{
+		syscall.SYS_BRK:             {},
+		syscall.SYS_CLONE:           {},
+		syscall.SYS_FUTEX:           {},
+		syscall.SYS_MMAP:            {},
+		syscall.SYS_MUNLOCK:         {},
+		syscall.SYS_NANOSLEEP:       {},
+		syscall.SYS_OPEN:            {},
+		syscall.SYS_OPENAT:          {},
+		syscall.SYS_SET_ROBUST_LIST: {},
+		// Used within glibc's malloc.
+		syscall.SYS_TIME: {},
+	}
+}
diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go
new file mode 100644
index 000000000..e80c171b3
--- /dev/null
+++ b/runsc/boot/filter/filter.go
@@ -0,0 +1,60 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package filter defines all syscalls the sandbox is allowed to make
+// to the host, and installs seccomp filters to prevent prohibited
+// syscalls in case it's compromised.
+package filter
+
+import (
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/seccomp"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+)
+
+// Options are seccomp filter related options.
+type Options struct {
+	Platform      platform.Platform
+	HostNetwork   bool
+	ProfileEnable bool
+	ControllerFD  int
+}
+
+// Install installs seccomp filters for based on the given platform.
+func Install(opt Options) error {
+	s := allowedSyscalls
+	s.Merge(controlServerFilters(opt.ControllerFD))
+
+	// Set of additional filters used by -race and -msan. Returns empty
+	// when not enabled.
+	s.Merge(instrumentationFilters())
+
+	if opt.HostNetwork {
+		Report("host networking enabled: syscall filters less restrictive!")
+		s.Merge(hostInetFilters())
+	}
+	if opt.ProfileEnable {
+		Report("profile enabled: syscall filters less restrictive!")
+		s.Merge(profileFilters())
+	}
+
+	s.Merge(opt.Platform.SyscallFilters())
+
+	return seccomp.Install(s)
+}
+
+// Report writes a warning message to the log.
+func Report(msg string) {
+	log.Warningf("*** SECCOMP WARNING: %s", msg)
+}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
new file mode 100644
index 000000000..59639ba19
--- /dev/null
+++ b/runsc/boot/fs.go
@@ -0,0 +1,1034 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"path/filepath"
+	"sort"
+	"strconv"
+	"strings"
+	"syscall"
+
+	// Include filesystem types that OCI spec might mount.
+	_ "gvisor.dev/gvisor/pkg/sentry/fs/dev"
+	_ "gvisor.dev/gvisor/pkg/sentry/fs/host"
+	_ "gvisor.dev/gvisor/pkg/sentry/fs/proc"
+	_ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
+	_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+	_ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
+	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/user"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+	gofervfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
+	procvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
+	sysvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
+	tmpfsvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/runsc/specutils"
+)
+
+const (
+	// Device name for root mount.
+	rootDevice = "9pfs-/"
+
+	// MountPrefix is the annotation prefix for mount hints.
+	MountPrefix = "dev.gvisor.spec.mount."
+
+	// Supported filesystems that map to different internal filesystem.
+	bind   = "bind"
+	nonefs = "none"
+)
+
+// tmpfs has some extra supported options that we must pass through.
+var tmpfsAllowedData = []string{"mode", "uid", "gid"}
+
+func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
+	// Upper layer uses the same flags as lower, but it must be read-write.
+	upperFlags := lowerFlags
+	upperFlags.ReadOnly = false
+
+	tmpFS := mustFindFilesystem("tmpfs")
+	if !fs.IsDir(lower.StableAttr) {
+		// Create overlay on top of mount file, e.g. /etc/hostname.
+		msrc := fs.NewCachingMountSource(ctx, tmpFS, upperFlags)
+		return fs.NewOverlayRootFile(ctx, msrc, lower, upperFlags)
+	}
+
+	// Create overlay on top of mount dir.
+	upper, err := tmpFS.Mount(ctx, name+"-upper", upperFlags, "", nil)
+	if err != nil {
+		return nil, fmt.Errorf("creating tmpfs overlay: %v", err)
+	}
+
+	// Replicate permissions and owner from lower to upper mount point.
+	attr, err := lower.UnstableAttr(ctx)
+	if err != nil {
+		return nil, fmt.Errorf("reading attributes from lower mount point: %v", err)
+	}
+	if !upper.InodeOperations.SetPermissions(ctx, upper, attr.Perms) {
+		return nil, fmt.Errorf("error setting permission to upper mount point")
+	}
+	if err := upper.InodeOperations.SetOwner(ctx, upper, attr.Owner); err != nil {
+		return nil, fmt.Errorf("setting owner to upper mount point: %v", err)
+	}
+
+	return fs.NewOverlayRoot(ctx, upper, lower, upperFlags)
+}
+
+// compileMounts returns the supported mounts from the mount spec, adding any
+// mandatory mounts that are required by the OCI specification.
+func compileMounts(spec *specs.Spec) []specs.Mount {
+	// Keep track of whether proc and sys were mounted.
+	var procMounted, sysMounted bool
+	var mounts []specs.Mount
+
+	// Always mount /dev.
+	mounts = append(mounts, specs.Mount{
+		Type:        devtmpfs.Name,
+		Destination: "/dev",
+	})
+
+	mounts = append(mounts, specs.Mount{
+		Type:        devpts.Name,
+		Destination: "/dev/pts",
+	})
+
+	// Mount all submounts from the spec.
+	for _, m := range spec.Mounts {
+		if !specutils.IsSupportedDevMount(m) {
+			log.Warningf("ignoring dev mount at %q", m.Destination)
+			continue
+		}
+		mounts = append(mounts, m)
+		switch filepath.Clean(m.Destination) {
+		case "/proc":
+			procMounted = true
+		case "/sys":
+			sysMounted = true
+		}
+	}
+
+	// Mount proc and sys even if the user did not ask for it, as the spec
+	// says we SHOULD.
+	var mandatoryMounts []specs.Mount
+	if !procMounted {
+		mandatoryMounts = append(mandatoryMounts, specs.Mount{
+			Type:        procvfs2.Name,
+			Destination: "/proc",
+		})
+	}
+	if !sysMounted {
+		mandatoryMounts = append(mandatoryMounts, specs.Mount{
+			Type:        sysvfs2.Name,
+			Destination: "/sys",
+		})
+	}
+
+	// The mandatory mounts should be ordered right after the root, in case
+	// there are submounts of these mandatory mounts already in the spec.
+	mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...)
+
+	return mounts
+}
+
+// p9MountData creates a slice of p9 mount data.
+func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
+	opts := []string{
+		"trans=fd",
+		"rfdno=" + strconv.Itoa(fd),
+		"wfdno=" + strconv.Itoa(fd),
+	}
+	if !vfs2 {
+		// privateunixsocket is always enabled in VFS2. VFS1 requires explicit
+		// enablement.
+		opts = append(opts, "privateunixsocket=true")
+	}
+	if fa == FileAccessShared {
+		opts = append(opts, "cache=remote_revalidating")
+	}
+	return opts
+}
+
+// parseAndFilterOptions parses a MountOptions slice and filters by the allowed
+// keys.
+func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) {
+	var out []string
+	for _, o := range opts {
+		ok, err := parseMountOption(o, allowedKeys...)
+		if err != nil {
+			return nil, err
+		}
+		if ok {
+			out = append(out, o)
+		}
+	}
+	return out, nil
+}
+
+func parseMountOption(opt string, allowedKeys ...string) (bool, error) {
+	kv := strings.SplitN(opt, "=", 3)
+	if len(kv) > 2 {
+		return false, fmt.Errorf("invalid option %q", opt)
+	}
+	return specutils.ContainsStr(allowedKeys, kv[0]), nil
+}
+
+// mountDevice returns a device string based on the fs type and target
+// of the mount.
+func mountDevice(m specs.Mount) string {
+	if m.Type == bind {
+		// Make a device string that includes the target, which is consistent across
+		// S/R and uniquely identifies the connection.
+		return "9pfs-" + m.Destination
+	}
+	// All other fs types use device "none".
+	return "none"
+}
+
+func mountFlags(opts []string) fs.MountSourceFlags {
+	mf := fs.MountSourceFlags{}
+	// Note: changes to supported options must be reflected in
+	// isSupportedMountFlag() as well.
+	for _, o := range opts {
+		switch o {
+		case "rw":
+			mf.ReadOnly = false
+		case "ro":
+			mf.ReadOnly = true
+		case "noatime":
+			mf.NoAtime = true
+		case "noexec":
+			mf.NoExec = true
+		default:
+			log.Warningf("ignoring unknown mount option %q", o)
+		}
+	}
+	return mf
+}
+
+func isSupportedMountFlag(fstype, opt string) bool {
+	switch opt {
+	case "rw", "ro", "noatime", "noexec":
+		return true
+	}
+	if fstype == tmpfsvfs2.Name {
+		ok, err := parseMountOption(opt, tmpfsAllowedData...)
+		return ok && err == nil
+	}
+	return false
+}
+
+func mustFindFilesystem(name string) fs.Filesystem {
+	fs, ok := fs.FindFilesystem(name)
+	if !ok {
+		panic(fmt.Sprintf("could not find filesystem %q", name))
+	}
+	return fs
+}
+
+// addSubmountOverlay overlays the inode over a ramfs tree containing the given
+// paths.
+func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
+	// Construct a ramfs tree of mount points. The contents never
+	// change, so this can be fully caching. There's no real
+	// filesystem backing this tree, so we set the filesystem to
+	// nil.
+	msrc := fs.NewCachingMountSource(ctx, nil, fs.MountSourceFlags{})
+	mountTree, err := ramfs.MakeDirectoryTree(ctx, msrc, submounts)
+	if err != nil {
+		return nil, fmt.Errorf("creating mount tree: %v", err)
+	}
+	overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{})
+	if err != nil {
+		return nil, fmt.Errorf("adding mount overlay: %v", err)
+	}
+	return overlayInode, err
+}
+
+// subtargets takes a set of Mounts and returns only the targets that are
+// children of the given root. The returned paths are relative to the root.
+func subtargets(root string, mnts []specs.Mount) []string {
+	var targets []string
+	for _, mnt := range mnts {
+		if relPath, isSubpath := fs.IsSubpath(mnt.Destination, root); isSubpath {
+			targets = append(targets, relPath)
+		}
+	}
+	return targets
+}
+
+func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+	if conf.VFS2 {
+		return setupContainerVFS2(ctx, conf, mntr, procArgs)
+	}
+	mns, err := mntr.setupFS(conf, procArgs)
+	if err != nil {
+		return err
+	}
+
+	// Set namespace here so that it can be found in ctx.
+	procArgs.MountNamespace = mns
+
+	// Resolve the executable path from working dir and environment.
+	resolved, err := user.ResolveExecutablePath(ctx, procArgs)
+	if err != nil {
+		return err
+	}
+	procArgs.Filename = resolved
+	return nil
+}
+
+func adjustDirentCache(k *kernel.Kernel) error {
+	var hl syscall.Rlimit
+	if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil {
+		return fmt.Errorf("getting RLIMIT_NOFILE: %v", err)
+	}
+	if int64(hl.Cur) != syscall.RLIM_INFINITY {
+		newSize := hl.Cur / 2
+		if newSize < gofer.DefaultDirentCacheSize {
+			log.Infof("Setting gofer dirent cache size to %d", newSize)
+			gofer.DefaultDirentCacheSize = newSize
+			k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize)
+		}
+	}
+	return nil
+}
+
+type fdDispenser struct {
+	fds []int
+}
+
+func (f *fdDispenser) remove() int {
+	if f.empty() {
+		panic("fdDispenser out of fds")
+	}
+	rv := f.fds[0]
+	f.fds = f.fds[1:]
+	return rv
+}
+
+func (f *fdDispenser) empty() bool {
+	return len(f.fds) == 0
+}
+
+type shareType int
+
+const (
+	invalid shareType = iota
+
+	// container shareType indicates that the mount is used by a single container.
+	container
+
+	// pod shareType indicates that the mount is used by more than one container
+	// inside the pod.
+	pod
+
+	// shared shareType indicates that the mount can also be shared with a process
+	// outside the pod, e.g. NFS.
+	shared
+)
+
+func parseShare(val string) (shareType, error) {
+	switch val {
+	case "container":
+		return container, nil
+	case "pod":
+		return pod, nil
+	case "shared":
+		return shared, nil
+	default:
+		return 0, fmt.Errorf("invalid share value %q", val)
+	}
+}
+
+func (s shareType) String() string {
+	switch s {
+	case invalid:
+		return "invalid"
+	case container:
+		return "container"
+	case pod:
+		return "pod"
+	case shared:
+		return "shared"
+	default:
+		return fmt.Sprintf("invalid share value %d", s)
+	}
+}
+
+// mountHint represents extra information about mounts that are provided via
+// annotations. They can override mount type, and provide sharing information
+// so that mounts can be correctly shared inside the pod.
+type mountHint struct {
+	name  string
+	share shareType
+	mount specs.Mount
+
+	// root is the inode where the volume is mounted. For mounts with 'pod' share
+	// the volume is mounted once and then bind mounted inside the containers.
+	root *fs.Inode
+
+	// vfsMount is the master mount for the volume. For mounts with 'pod' share
+	// the master volume is bind mounted inside the containers.
+	vfsMount *vfs.Mount
+}
+
+func (m *mountHint) setField(key, val string) error {
+	switch key {
+	case "source":
+		if len(val) == 0 {
+			return fmt.Errorf("source cannot be empty")
+		}
+		m.mount.Source = val
+	case "type":
+		return m.setType(val)
+	case "share":
+		share, err := parseShare(val)
+		if err != nil {
+			return err
+		}
+		m.share = share
+	case "options":
+		return m.setOptions(val)
+	default:
+		return fmt.Errorf("invalid mount annotation: %s=%s", key, val)
+	}
+	return nil
+}
+
+func (m *mountHint) setType(val string) error {
+	switch val {
+	case "tmpfs", "bind":
+		m.mount.Type = val
+	default:
+		return fmt.Errorf("invalid type %q", val)
+	}
+	return nil
+}
+
+func (m *mountHint) setOptions(val string) error {
+	opts := strings.Split(val, ",")
+	if err := specutils.ValidateMountOptions(opts); err != nil {
+		return err
+	}
+	// Sort options so it can be compared with container mount options later on.
+	sort.Strings(opts)
+	m.mount.Options = opts
+	return nil
+}
+
+func (m *mountHint) isSupported() bool {
+	return m.mount.Type == tmpfsvfs2.Name && m.share == pod
+}
+
+// checkCompatible verifies that shared mount is compatible with master.
+// For now enforce that all options are the same. Once bind mount is properly
+// supported, then we should ensure the master is less restrictive than the
+// container, e.g. master can be 'rw' while container mounts as 'ro'.
+func (m *mountHint) checkCompatible(mount specs.Mount) error {
+	// Remove options that don't affect to mount's behavior.
+	masterOpts := filterUnsupportedOptions(m.mount)
+	slaveOpts := filterUnsupportedOptions(mount)
+
+	if len(masterOpts) != len(slaveOpts) {
+		return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+	}
+
+	sort.Strings(masterOpts)
+	sort.Strings(slaveOpts)
+	for i, opt := range masterOpts {
+		if opt != slaveOpts[i] {
+			return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+		}
+	}
+	return nil
+}
+
+func (m *mountHint) fileAccessType() FileAccessType {
+	if m.share == container {
+		return FileAccessExclusive
+	}
+	return FileAccessShared
+}
+
+func filterUnsupportedOptions(mount specs.Mount) []string {
+	rv := make([]string, 0, len(mount.Options))
+	for _, o := range mount.Options {
+		if isSupportedMountFlag(mount.Type, o) {
+			rv = append(rv, o)
+		}
+	}
+	return rv
+}
+
+// podMountHints contains a collection of mountHints for the pod.
+type podMountHints struct {
+	mounts map[string]*mountHint
+}
+
+func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
+	mnts := make(map[string]*mountHint)
+	for k, v := range spec.Annotations {
+		// Look for 'dev.gvisor.spec.mount' annotations and parse them.
+		if strings.HasPrefix(k, MountPrefix) {
+			// Remove the prefix and split the rest.
+			parts := strings.Split(k[len(MountPrefix):], ".")
+			if len(parts) != 2 {
+				return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v)
+			}
+			name := parts[0]
+			if len(name) == 0 {
+				return nil, fmt.Errorf("invalid mount name: %s", name)
+			}
+			mnt := mnts[name]
+			if mnt == nil {
+				mnt = &mountHint{name: name}
+				mnts[name] = mnt
+			}
+			if err := mnt.setField(parts[1], v); err != nil {
+				return nil, err
+			}
+		}
+	}
+
+	// Validate all hints after done parsing.
+	for name, m := range mnts {
+		log.Infof("Mount annotation found, name: %s, source: %q, type: %s, share: %v", name, m.mount.Source, m.mount.Type, m.share)
+		if m.share == invalid {
+			return nil, fmt.Errorf("share field for %q has not been set", m.name)
+		}
+		if len(m.mount.Source) == 0 {
+			return nil, fmt.Errorf("source field for %q has not been set", m.name)
+		}
+		if len(m.mount.Type) == 0 {
+			return nil, fmt.Errorf("type field for %q has not been set", m.name)
+		}
+
+		// Check for duplicate mount sources.
+		for name2, m2 := range mnts {
+			if name != name2 && m.mount.Source == m2.mount.Source {
+				return nil, fmt.Errorf("mounts %q and %q have the same mount source %q", m.name, m2.name, m.mount.Source)
+			}
+		}
+	}
+
+	return &podMountHints{mounts: mnts}, nil
+}
+
+func (p *podMountHints) findMount(mount specs.Mount) *mountHint {
+	for _, m := range p.mounts {
+		if m.mount.Source == mount.Source {
+			return m
+		}
+	}
+	return nil
+}
+
+type containerMounter struct {
+	root *specs.Root
+
+	// mounts is the set of submounts for the container. It's a copy from the spec
+	// that may be freely modified without affecting the original spec.
+	mounts []specs.Mount
+
+	// fds is the list of FDs to be dispensed for mounts that require it.
+	fds fdDispenser
+
+	k *kernel.Kernel
+
+	hints *podMountHints
+}
+
+func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
+	return &containerMounter{
+		root:   spec.Root,
+		mounts: compileMounts(spec),
+		fds:    fdDispenser{fds: goferFDs},
+		k:      k,
+		hints:  hints,
+	}
+}
+
+// processHints processes annotations that container hints about how volumes
+// should be mounted (e.g. a volume shared between containers). It must be
+// called for the root container only.
+func (c *containerMounter) processHints(conf *Config, creds *auth.Credentials) error {
+	if conf.VFS2 {
+		return c.processHintsVFS2(conf, creds)
+	}
+	ctx := c.k.SupervisorContext()
+	for _, hint := range c.hints.mounts {
+		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
+		// common gofer to mount all shared volumes.
+		if hint.mount.Type != tmpfsvfs2.Name {
+			continue
+		}
+		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
+		inode, err := c.mountSharedMaster(ctx, conf, hint)
+		if err != nil {
+			return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
+		}
+		hint.root = inode
+	}
+	return nil
+}
+
+// setupFS is used to set up the file system for all containers. This is the
+// main entry point method, with most of the other being internal only. It
+// returns the mount namespace that is created for the container.
+func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
+	log.Infof("Configuring container's file system")
+
+	// Create context with root credentials to mount the filesystem (the current
+	// user may not be privileged enough).
+	rootProcArgs := *procArgs
+	rootProcArgs.WorkingDirectory = "/"
+	rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
+	rootProcArgs.Umask = 0022
+	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
+	rootCtx := rootProcArgs.NewContext(c.k)
+
+	mns, err := c.createMountNamespace(rootCtx, conf)
+	if err != nil {
+		return nil, err
+	}
+
+	// Set namespace here so that it can be found in rootCtx.
+	rootProcArgs.MountNamespace = mns
+
+	if err := c.mountSubmounts(rootCtx, conf, mns); err != nil {
+		return nil, err
+	}
+	return mns, nil
+}
+
+func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Config) (*fs.MountNamespace, error) {
+	rootInode, err := c.createRootMount(ctx, conf)
+	if err != nil {
+		return nil, fmt.Errorf("creating filesystem for container: %v", err)
+	}
+	mns, err := fs.NewMountNamespace(ctx, rootInode)
+	if err != nil {
+		return nil, fmt.Errorf("creating new mount namespace for container: %v", err)
+	}
+	return mns, nil
+}
+
+func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error {
+	root := mns.Root()
+	defer root.DecRef()
+
+	for _, m := range c.mounts {
+		log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options)
+		if hint := c.hints.findMount(m); hint != nil && hint.isSupported() {
+			if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil {
+				return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err)
+			}
+		} else {
+			if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
+				return fmt.Errorf("mount submount %q: %v", m.Destination, err)
+			}
+		}
+	}
+
+	if err := c.mountTmp(ctx, conf, mns, root); err != nil {
+		return fmt.Errorf("mount submount %q: %v", "tmp", err)
+	}
+
+	if err := c.checkDispenser(); err != nil {
+		return err
+	}
+	return nil
+}
+
+func (c *containerMounter) checkDispenser() error {
+	if !c.fds.empty() {
+		return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds)
+	}
+	return nil
+}
+
+// mountSharedMaster mounts the master of a volume that is shared among
+// containers in a pod. It returns the root mount's inode.
+func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) {
+	// Map mount type to filesystem name, and parse out the options that we are
+	// capable of dealing with.
+	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount)
+	if err != nil {
+		return nil, err
+	}
+	if len(fsName) == 0 {
+		return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
+	}
+
+	// Mount with revalidate because it's shared among containers.
+	opts = append(opts, "cache=revalidate")
+
+	// All filesystem names should have been mapped to something we know.
+	filesystem := mustFindFilesystem(fsName)
+
+	mf := mountFlags(hint.mount.Options)
+	if useOverlay {
+		// All writes go to upper, be paranoid and make lower readonly.
+		mf.ReadOnly = true
+	}
+
+	inode, err := filesystem.Mount(ctx, mountDevice(hint.mount), mf, strings.Join(opts, ","), nil)
+	if err != nil {
+		return nil, fmt.Errorf("creating mount %q: %v", hint.name, err)
+	}
+
+	if useOverlay {
+		log.Debugf("Adding overlay on top of shared mount %q", hint.name)
+		inode, err = addOverlay(ctx, conf, inode, hint.mount.Type, mf)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	return inode, nil
+}
+
+// createRootMount creates the root filesystem.
+func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) {
+	// First construct the filesystem from the spec.Root.
+	mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay}
+
+	fd := c.fds.remove()
+	log.Infof("Mounting root over 9P, ioFD: %d", fd)
+	p9FS := mustFindFilesystem("9p")
+	opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */)
+
+	if conf.OverlayfsStaleRead {
+		// We can't check for overlayfs here because sandbox is chroot'ed and gofer
+		// can only send mount options for specs.Mounts (specs.Root is missing
+		// Options field). So assume root is always on top of overlayfs.
+		opts = append(opts, "overlayfs_stale_read")
+	}
+
+	rootInode, err := p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
+	if err != nil {
+		return nil, fmt.Errorf("creating root mount point: %v", err)
+	}
+
+	// We need to overlay the root on top of a ramfs with stub directories
+	// for submount paths.  "/dev" "/sys" "/proc" and "/tmp" are always
+	// mounted even if they are not in the spec.
+	submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp")
+	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
+	if err != nil {
+		return nil, fmt.Errorf("adding submount overlay: %v", err)
+	}
+
+	if conf.Overlay && !c.root.Readonly {
+		log.Debugf("Adding overlay on top of root mount")
+		// Overlay a tmpfs filesystem on top of the root.
+		rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	log.Infof("Mounted %q to %q type root", c.root.Path, "/")
+	return rootInode, nil
+}
+
+// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
+// used for mounts.
+func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) {
+	var (
+		fsName     string
+		opts       []string
+		useOverlay bool
+	)
+
+	switch m.Type {
+	case devpts.Name, devtmpfs.Name, procvfs2.Name, sysvfs2.Name:
+		fsName = m.Type
+	case nonefs:
+		fsName = sysvfs2.Name
+	case tmpfsvfs2.Name:
+		fsName = m.Type
+
+		var err error
+		opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
+		if err != nil {
+			return "", nil, false, err
+		}
+
+	case bind:
+		fd := c.fds.remove()
+		fsName = gofervfs2.Name
+		opts = p9MountData(fd, c.getMountAccessType(m), conf.VFS2)
+		// If configured, add overlay to all writable mounts.
+		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
+
+	default:
+		log.Warningf("ignoring unknown filesystem type %q", m.Type)
+	}
+	return fsName, opts, useOverlay, nil
+}
+
+func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType {
+	if hint := c.hints.findMount(mount); hint != nil {
+		return hint.fileAccessType()
+	}
+	// Non-root bind mounts are always shared if no hints were provided.
+	return FileAccessShared
+}
+
+// mountSubmount mounts volumes inside the container's root. Because mounts may
+// be readonly, a lower ramfs overlay is added to create the mount point dir.
+// Another overlay is added with tmpfs on top if Config.Overlay is true.
+// 'm.Destination' must be an absolute path with '..' and symlinks resolved.
+func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
+	// Map mount type to filesystem name, and parse out the options that we are
+	// capable of dealing with.
+	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
+	if err != nil {
+		return err
+	}
+	if fsName == "" {
+		// Filesystem is not supported (e.g. cgroup), just skip it.
+		return nil
+	}
+
+	// All filesystem names should have been mapped to something we know.
+	filesystem := mustFindFilesystem(fsName)
+
+	mf := mountFlags(m.Options)
+	if useOverlay {
+		// All writes go to upper, be paranoid and make lower readonly.
+		mf.ReadOnly = true
+	}
+
+	inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
+	if err != nil {
+		err := fmt.Errorf("creating mount with source %q: %v", m.Source, err)
+		// Check to see if this is a common error due to a Linux bug.
+		// This error is generated here in order to cause it to be
+		// printed to the user using Docker via 'runsc create' etc. rather
+		// than simply printed to the logs for the 'runsc boot' command.
+		//
+		// We check the error message string rather than type because the
+		// actual error types (syscall.EIO, syscall.EPIPE) are lost by file system
+		// implementation (e.g. p9).
+		// TODO(gvisor.dev/issue/1765): Remove message when bug is resolved.
+		if strings.Contains(err.Error(), syscall.EIO.Error()) || strings.Contains(err.Error(), syscall.EPIPE.Error()) {
+			return fmt.Errorf("%v: %s", err, specutils.FaqErrorMsg("memlock", "you may be encountering a Linux kernel bug"))
+		}
+		return err
+	}
+
+	// If there are submounts, we need to overlay the mount on top of a ramfs
+	// with stub directories for submount paths.
+	submounts := subtargets(m.Destination, c.mounts)
+	if len(submounts) > 0 {
+		log.Infof("Adding submount overlay over %q", m.Destination)
+		inode, err = addSubmountOverlay(ctx, inode, submounts)
+		if err != nil {
+			return fmt.Errorf("adding submount overlay: %v", err)
+		}
+	}
+
+	if useOverlay {
+		log.Debugf("Adding overlay on top of mount %q", m.Destination)
+		inode, err = addOverlay(ctx, conf, inode, m.Type, mf)
+		if err != nil {
+			return err
+		}
+	}
+
+	maxTraversals := uint(0)
+	dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals)
+	if err != nil {
+		return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
+	}
+	defer dirent.DecRef()
+	if err := mns.Mount(ctx, dirent, inode); err != nil {
+		return fmt.Errorf("mount %q error: %v", m.Destination, err)
+	}
+
+	log.Infof("Mounted %q to %q type: %s, internal-options: %q", m.Source, m.Destination, m.Type, opts)
+	return nil
+}
+
+// mountSharedSubmount binds mount to a previously mounted volume that is shared
+// among containers in the same pod.
+func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount specs.Mount, source *mountHint) error {
+	if err := source.checkCompatible(mount); err != nil {
+		return err
+	}
+
+	maxTraversals := uint(0)
+	target, err := mns.FindInode(ctx, root, root, mount.Destination, &maxTraversals)
+	if err != nil {
+		return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err)
+	}
+	defer target.DecRef()
+
+	// Take a ref on the inode that is about to be (re)-mounted.
+	source.root.IncRef()
+	if err := mns.Mount(ctx, target, source.root); err != nil {
+		source.root.DecRef()
+		return fmt.Errorf("bind mount %q error: %v", mount.Destination, err)
+	}
+
+	log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
+	return nil
+}
+
+// addRestoreMount adds a mount to the MountSources map used for restoring a
+// checkpointed container.
+func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
+	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
+	if err != nil {
+		return err
+	}
+	if fsName == "" {
+		// Filesystem is not supported (e.g. cgroup), just skip it.
+		return nil
+	}
+
+	newMount := fs.MountArgs{
+		Dev:        mountDevice(m),
+		Flags:      mountFlags(m.Options),
+		DataString: strings.Join(opts, ","),
+	}
+	if useOverlay {
+		newMount.Flags.ReadOnly = true
+	}
+	renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount)
+	log.Infof("Added mount at %q: %+v", fsName, newMount)
+	return nil
+}
+
+// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding
+// the mounts to the environment.
+func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) {
+	renv := &fs.RestoreEnvironment{
+		MountSources: make(map[string][]fs.MountArgs),
+	}
+
+	// Add root mount.
+	fd := c.fds.remove()
+	opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */)
+
+	mf := fs.MountSourceFlags{}
+	if c.root.Readonly || conf.Overlay {
+		mf.ReadOnly = true
+	}
+
+	rootMount := fs.MountArgs{
+		Dev:        rootDevice,
+		Flags:      mf,
+		DataString: strings.Join(opts, ","),
+	}
+	renv.MountSources[gofervfs2.Name] = append(renv.MountSources[gofervfs2.Name], rootMount)
+
+	// Add submounts.
+	var tmpMounted bool
+	for _, m := range c.mounts {
+		if err := c.addRestoreMount(conf, renv, m); err != nil {
+			return nil, err
+		}
+		if filepath.Clean(m.Destination) == "/tmp" {
+			tmpMounted = true
+		}
+	}
+
+	// TODO(b/67958150): handle '/tmp' properly (see mountTmp()).
+	if !tmpMounted {
+		tmpMount := specs.Mount{
+			Type:        tmpfsvfs2.Name,
+			Destination: "/tmp",
+		}
+		if err := c.addRestoreMount(conf, renv, tmpMount); err != nil {
+			return nil, err
+		}
+	}
+
+	return renv, nil
+}
+
+// mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so.
+// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
+// the host /tmp, but this is a nice optimization, and fixes some apps that call
+// mknod in /tmp. It's unsafe to mount tmpfs if:
+//   1. /tmp is mounted explicitly: we should not override user's wish
+//   2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
+//
+// Note that when there are submounts inside of '/tmp', directories for the
+// mount points must be present, making '/tmp' not empty anymore.
+func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
+	for _, m := range c.mounts {
+		if filepath.Clean(m.Destination) == "/tmp" {
+			log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m)
+			return nil
+		}
+	}
+
+	maxTraversals := uint(0)
+	tmp, err := mns.FindInode(ctx, root, root, "tmp", &maxTraversals)
+	switch err {
+	case nil:
+		// Found '/tmp' in filesystem, check if it's empty.
+		defer tmp.DecRef()
+		f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true})
+		if err != nil {
+			return err
+		}
+		defer f.DecRef()
+		serializer := &fs.CollectEntriesSerializer{}
+		if err := f.Readdir(ctx, serializer); err != nil {
+			return err
+		}
+		// If more than "." and ".." is found, skip internal tmpfs to prevent hiding
+		// existing files.
+		if len(serializer.Order) > 2 {
+			log.Infof("Skipping internal tmpfs on top %q, because it's not empty", "/tmp")
+			return nil
+		}
+		log.Infof("Mounting internal tmpfs on top of empty %q", "/tmp")
+		fallthrough
+
+	case syserror.ENOENT:
+		// No '/tmp' found (or fallthrough from above). Safe to mount internal
+		// tmpfs.
+		tmpMount := specs.Mount{
+			Type:        tmpfsvfs2.Name,
+			Destination: "/tmp",
+			// Sticky bit is added to prevent accidental deletion of files from
+			// another user. This is normally done for /tmp.
+			Options: []string{"mode=01777"},
+		}
+		return c.mountSubmount(ctx, conf, mns, root, tmpMount)
+
+	default:
+		return err
+	}
+}
diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go
new file mode 100644
index 000000000..912037075
--- /dev/null
+++ b/runsc/boot/fs_test.go
@@ -0,0 +1,250 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"reflect"
+	"strings"
+	"testing"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+func TestPodMountHintsHappy(t *testing.T) {
+	spec := &specs.Spec{
+		Annotations: map[string]string{
+			MountPrefix + "mount1.source": "foo",
+			MountPrefix + "mount1.type":   "tmpfs",
+			MountPrefix + "mount1.share":  "pod",
+
+			MountPrefix + "mount2.source":  "bar",
+			MountPrefix + "mount2.type":    "bind",
+			MountPrefix + "mount2.share":   "container",
+			MountPrefix + "mount2.options": "rw,private",
+		},
+	}
+	podHints, err := newPodMountHints(spec)
+	if err != nil {
+		t.Fatalf("newPodMountHints failed: %v", err)
+	}
+
+	// Check that fields were set correctly.
+	mount1 := podHints.mounts["mount1"]
+	if want := "mount1"; want != mount1.name {
+		t.Errorf("mount1 name, want: %q, got: %q", want, mount1.name)
+	}
+	if want := "foo"; want != mount1.mount.Source {
+		t.Errorf("mount1 source, want: %q, got: %q", want, mount1.mount.Source)
+	}
+	if want := "tmpfs"; want != mount1.mount.Type {
+		t.Errorf("mount1 type, want: %q, got: %q", want, mount1.mount.Type)
+	}
+	if want := pod; want != mount1.share {
+		t.Errorf("mount1 type, want: %q, got: %q", want, mount1.share)
+	}
+	if want := []string(nil); !reflect.DeepEqual(want, mount1.mount.Options) {
+		t.Errorf("mount1 type, want: %q, got: %q", want, mount1.mount.Options)
+	}
+
+	mount2 := podHints.mounts["mount2"]
+	if want := "mount2"; want != mount2.name {
+		t.Errorf("mount2 name, want: %q, got: %q", want, mount2.name)
+	}
+	if want := "bar"; want != mount2.mount.Source {
+		t.Errorf("mount2 source, want: %q, got: %q", want, mount2.mount.Source)
+	}
+	if want := "bind"; want != mount2.mount.Type {
+		t.Errorf("mount2 type, want: %q, got: %q", want, mount2.mount.Type)
+	}
+	if want := container; want != mount2.share {
+		t.Errorf("mount2 type, want: %q, got: %q", want, mount2.share)
+	}
+	if want := []string{"private", "rw"}; !reflect.DeepEqual(want, mount2.mount.Options) {
+		t.Errorf("mount2 type, want: %q, got: %q", want, mount2.mount.Options)
+	}
+}
+
+func TestPodMountHintsErrors(t *testing.T) {
+	for _, tst := range []struct {
+		name        string
+		annotations map[string]string
+		error       string
+	}{
+		{
+			name: "too short",
+			annotations: map[string]string{
+				MountPrefix + "mount1": "foo",
+			},
+			error: "invalid mount annotation",
+		},
+		{
+			name: "no name",
+			annotations: map[string]string{
+				MountPrefix + ".source": "foo",
+			},
+			error: "invalid mount name",
+		},
+		{
+			name: "missing source",
+			annotations: map[string]string{
+				MountPrefix + "mount1.type":  "tmpfs",
+				MountPrefix + "mount1.share": "pod",
+			},
+			error: "source field",
+		},
+		{
+			name: "missing type",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.share":  "pod",
+			},
+			error: "type field",
+		},
+		{
+			name: "missing share",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "tmpfs",
+			},
+			error: "share field",
+		},
+		{
+			name: "invalid field name",
+			annotations: map[string]string{
+				MountPrefix + "mount1.invalid": "foo",
+			},
+			error: "invalid mount annotation",
+		},
+		{
+			name: "invalid source",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": "",
+				MountPrefix + "mount1.type":   "tmpfs",
+				MountPrefix + "mount1.share":  "pod",
+			},
+			error: "source cannot be empty",
+		},
+		{
+			name: "invalid type",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "invalid-type",
+				MountPrefix + "mount1.share":  "pod",
+			},
+			error: "invalid type",
+		},
+		{
+			name: "invalid share",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "tmpfs",
+				MountPrefix + "mount1.share":  "invalid-share",
+			},
+			error: "invalid share",
+		},
+		{
+			name: "invalid options",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source":  "foo",
+				MountPrefix + "mount1.type":    "tmpfs",
+				MountPrefix + "mount1.share":   "pod",
+				MountPrefix + "mount1.options": "invalid-option",
+			},
+			error: "unknown mount option",
+		},
+		{
+			name: "duplicate source",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "tmpfs",
+				MountPrefix + "mount1.share":  "pod",
+
+				MountPrefix + "mount2.source": "foo",
+				MountPrefix + "mount2.type":   "bind",
+				MountPrefix + "mount2.share":  "container",
+			},
+			error: "have the same mount source",
+		},
+	} {
+		t.Run(tst.name, func(t *testing.T) {
+			spec := &specs.Spec{Annotations: tst.annotations}
+			podHints, err := newPodMountHints(spec)
+			if err == nil || !strings.Contains(err.Error(), tst.error) {
+				t.Errorf("newPodMountHints invalid error, want: .*%s.*, got: %v", tst.error, err)
+			}
+			if podHints != nil {
+				t.Errorf("newPodMountHints must return nil on failure: %+v", podHints)
+			}
+		})
+	}
+}
+
+func TestGetMountAccessType(t *testing.T) {
+	const source = "foo"
+	for _, tst := range []struct {
+		name        string
+		annotations map[string]string
+		want        FileAccessType
+	}{
+		{
+			name: "container=exclusive",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": source,
+				MountPrefix + "mount1.type":   "bind",
+				MountPrefix + "mount1.share":  "container",
+			},
+			want: FileAccessExclusive,
+		},
+		{
+			name: "pod=shared",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": source,
+				MountPrefix + "mount1.type":   "bind",
+				MountPrefix + "mount1.share":  "pod",
+			},
+			want: FileAccessShared,
+		},
+		{
+			name: "shared=shared",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": source,
+				MountPrefix + "mount1.type":   "bind",
+				MountPrefix + "mount1.share":  "shared",
+			},
+			want: FileAccessShared,
+		},
+		{
+			name: "default=shared",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": source + "mismatch",
+				MountPrefix + "mount1.type":   "bind",
+				MountPrefix + "mount1.share":  "container",
+			},
+			want: FileAccessShared,
+		},
+	} {
+		t.Run(tst.name, func(t *testing.T) {
+			spec := &specs.Spec{Annotations: tst.annotations}
+			podHints, err := newPodMountHints(spec)
+			if err != nil {
+				t.Fatalf("newPodMountHints failed: %v", err)
+			}
+			mounter := containerMounter{hints: podHints}
+			if got := mounter.getMountAccessType(specs.Mount{Source: source}); got != tst.want {
+				t.Errorf("getMountAccessType(), want: %v, got: %v", tst.want, got)
+			}
+		})
+	}
+}
diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go
new file mode 100644
index 000000000..ce62236e5
--- /dev/null
+++ b/runsc/boot/limits.go
@@ -0,0 +1,154 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"syscall"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// Mapping from linux resource names to limits.LimitType.
+var fromLinuxResource = map[string]limits.LimitType{
+	"RLIMIT_AS":         limits.AS,
+	"RLIMIT_CORE":       limits.Core,
+	"RLIMIT_CPU":        limits.CPU,
+	"RLIMIT_DATA":       limits.Data,
+	"RLIMIT_FSIZE":      limits.FileSize,
+	"RLIMIT_LOCKS":      limits.Locks,
+	"RLIMIT_MEMLOCK":    limits.MemoryLocked,
+	"RLIMIT_MSGQUEUE":   limits.MessageQueueBytes,
+	"RLIMIT_NICE":       limits.Nice,
+	"RLIMIT_NOFILE":     limits.NumberOfFiles,
+	"RLIMIT_NPROC":      limits.ProcessCount,
+	"RLIMIT_RSS":        limits.Rss,
+	"RLIMIT_RTPRIO":     limits.RealTimePriority,
+	"RLIMIT_RTTIME":     limits.Rttime,
+	"RLIMIT_SIGPENDING": limits.SignalsPending,
+	"RLIMIT_STACK":      limits.Stack,
+}
+
+func findName(lt limits.LimitType) string {
+	for k, v := range fromLinuxResource {
+		if v == lt {
+			return k
+		}
+	}
+	return "unknown"
+}
+
+var defaults defs
+
+type defs struct {
+	mu  sync.Mutex
+	set *limits.LimitSet
+	err error
+}
+
+func (d *defs) get() (*limits.LimitSet, error) {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	if d.err != nil {
+		return nil, d.err
+	}
+	if d.set == nil {
+		if err := d.initDefaults(); err != nil {
+			d.err = err
+			return nil, err
+		}
+	}
+	return d.set, nil
+}
+
+func (d *defs) initDefaults() error {
+	ls, err := limits.NewLinuxLimitSet()
+	if err != nil {
+		return err
+	}
+
+	// Set default limits based on what containers get by default, ex:
+	// $ docker run --rm debian prlimit
+	ls.SetUnchecked(limits.AS, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+	ls.SetUnchecked(limits.Core, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+	ls.SetUnchecked(limits.CPU, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+	ls.SetUnchecked(limits.Data, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+	ls.SetUnchecked(limits.FileSize, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+	ls.SetUnchecked(limits.Locks, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+	ls.SetUnchecked(limits.MemoryLocked, limits.Limit{Cur: 65536, Max: 65536})
+	ls.SetUnchecked(limits.MessageQueueBytes, limits.Limit{Cur: 819200, Max: 819200})
+	ls.SetUnchecked(limits.Nice, limits.Limit{Cur: 0, Max: 0})
+	ls.SetUnchecked(limits.NumberOfFiles, limits.Limit{Cur: 1048576, Max: 1048576})
+	ls.SetUnchecked(limits.ProcessCount, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+	ls.SetUnchecked(limits.Rss, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+	ls.SetUnchecked(limits.RealTimePriority, limits.Limit{Cur: 0, Max: 0})
+	ls.SetUnchecked(limits.Rttime, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity})
+	ls.SetUnchecked(limits.SignalsPending, limits.Limit{Cur: 0, Max: 0})
+	ls.SetUnchecked(limits.Stack, limits.Limit{Cur: 8388608, Max: limits.Infinity})
+
+	// Read host limits that directly affect the sandbox and adjust the defaults
+	// based on them.
+	for _, res := range []int{syscall.RLIMIT_FSIZE, syscall.RLIMIT_NOFILE} {
+		var hl syscall.Rlimit
+		if err := syscall.Getrlimit(res, &hl); err != nil {
+			return err
+		}
+
+		lt, ok := limits.FromLinuxResource[res]
+		if !ok {
+			return fmt.Errorf("unknown rlimit type %v", res)
+		}
+		hostLimit := limits.Limit{
+			Cur: limits.FromLinux(hl.Cur),
+			Max: limits.FromLinux(hl.Max),
+		}
+
+		defaultLimit := ls.Get(lt)
+		if hostLimit.Cur != limits.Infinity && hostLimit.Cur < defaultLimit.Cur {
+			log.Warningf("Host limit is lower than recommended, resource: %q, host: %d, recommended: %d", findName(lt), hostLimit.Cur, defaultLimit.Cur)
+		}
+		if hostLimit.Cur != defaultLimit.Cur || hostLimit.Max != defaultLimit.Max {
+			log.Infof("Setting limit from host, resource: %q {soft: %d, hard: %d}", findName(lt), hostLimit.Cur, hostLimit.Max)
+			ls.SetUnchecked(lt, hostLimit)
+		}
+	}
+
+	d.set = ls
+	return nil
+}
+
+func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) {
+	ls, err := defaults.get()
+	if err != nil {
+		return nil, err
+	}
+
+	// Then apply overwrites on top of defaults.
+	for _, rl := range spec.Process.Rlimits {
+		lt, ok := fromLinuxResource[rl.Type]
+		if !ok {
+			return nil, fmt.Errorf("unknown resource %q", rl.Type)
+		}
+		ls.SetUnchecked(lt, limits.Limit{
+			Cur: rl.Soft,
+			Max: rl.Hard,
+		})
+	}
+	return ls, nil
+}
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
new file mode 100644
index 000000000..0c0423ab2
--- /dev/null
+++ b/runsc/boot/loader.go
@@ -0,0 +1,1284 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package boot loads the kernel and runs a container.
+package boot
+
+import (
+	"fmt"
+	mrand "math/rand"
+	"os"
+	"runtime"
+	"sync/atomic"
+	"syscall"
+	gtime "time"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/cpuid"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/memutil"
+	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/fdimport"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/host"
+	"gvisor.dev/gvisor/pkg/sentry/fs/user"
+	hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/loader"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/sighandling"
+	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2"
+	"gvisor.dev/gvisor/pkg/sentry/time"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/watchdog"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
+	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/raw"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+	"gvisor.dev/gvisor/runsc/boot/filter"
+	_ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms.
+	"gvisor.dev/gvisor/runsc/boot/pprof"
+	"gvisor.dev/gvisor/runsc/specutils"
+
+	// Include supported socket providers.
+	"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
+	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink"
+	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route"
+	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
+	_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
+)
+
+// Loader keeps state needed to start the kernel and run the container..
+type Loader struct {
+	// k is the kernel.
+	k *kernel.Kernel
+
+	// ctrl is the control server.
+	ctrl *controller
+
+	conf *Config
+
+	// console is set to true if terminal is enabled.
+	console bool
+
+	watchdog *watchdog.Watchdog
+
+	// stdioFDs contains stdin, stdout, and stderr.
+	stdioFDs []int
+
+	// goferFDs are the FDs that attach the sandbox to the gofers.
+	goferFDs []int
+
+	// spec is the base configuration for the root container.
+	spec *specs.Spec
+
+	// stopSignalForwarding disables forwarding of signals to the sandboxed
+	// container. It should be called when a sandbox is destroyed.
+	stopSignalForwarding func()
+
+	// restore is set to true if we are restoring a container.
+	restore bool
+
+	// rootProcArgs refers to the root sandbox init task.
+	rootProcArgs kernel.CreateProcessArgs
+
+	// sandboxID is the ID for the whole sandbox.
+	sandboxID string
+
+	// mu guards processes.
+	mu sync.Mutex
+
+	// processes maps containers init process and invocation of exec. Root
+	// processes are keyed with container ID and pid=0, while exec invocations
+	// have the corresponding pid set.
+	//
+	// processes is guardded by mu.
+	processes map[execID]*execProcess
+
+	// mountHints provides extra information about mounts for containers that
+	// apply to the entire pod.
+	mountHints *podMountHints
+}
+
+// execID uniquely identifies a sentry process that is executed in a container.
+type execID struct {
+	cid string
+	pid kernel.ThreadID
+}
+
+// execProcess contains the thread group and host TTY of a sentry process.
+type execProcess struct {
+	// tg will be nil for containers that haven't started yet.
+	tg *kernel.ThreadGroup
+
+	// tty will be nil if the process is not attached to a terminal.
+	tty *host.TTYFileOperations
+
+	// tty will be nil if the process is not attached to a terminal.
+	ttyVFS2 *hostvfs2.TTYFileDescription
+
+	// pidnsPath is the pid namespace path in spec
+	pidnsPath string
+}
+
+func init() {
+	// Initialize the random number generator.
+	mrand.Seed(gtime.Now().UnixNano())
+}
+
+// Args are the arguments for New().
+type Args struct {
+	// Id is the sandbox ID.
+	ID string
+	// Spec is the sandbox specification.
+	Spec *specs.Spec
+	// Conf is the system configuration.
+	Conf *Config
+	// ControllerFD is the FD to the URPC controller. The Loader takes ownership
+	// of this FD and may close it at any time.
+	ControllerFD int
+	// Device is an optional argument that is passed to the platform. The Loader
+	// takes ownership of this file and may close it at any time.
+	Device *os.File
+	// GoferFDs is an array of FDs used to connect with the Gofer. The Loader
+	// takes ownership of these FDs and may close them at any time.
+	GoferFDs []int
+	// StdioFDs is the stdio for the application. The Loader takes ownership of
+	// these FDs and may close them at any time.
+	StdioFDs []int
+	// Console is set to true if using TTY.
+	Console bool
+	// NumCPU is the number of CPUs to create inside the sandbox.
+	NumCPU int
+	// TotalMem is the initial amount of total memory to report back to the
+	// container.
+	TotalMem uint64
+	// UserLogFD is the file descriptor to write user logs to.
+	UserLogFD int
+}
+
+// make sure stdioFDs are always the same on initial start and on restore
+const startingStdioFD = 64
+
+// New initializes a new kernel loader configured by spec.
+// New also handles setting up a kernel for restoring a container.
+func New(args Args) (*Loader, error) {
+	// We initialize the rand package now to make sure /dev/urandom is pre-opened
+	// on kernels that do not support getrandom(2).
+	if err := rand.Init(); err != nil {
+		return nil, fmt.Errorf("setting up rand: %v", err)
+	}
+
+	if err := usage.Init(); err != nil {
+		return nil, fmt.Errorf("setting up memory usage: %v", err)
+	}
+
+	// Is this a VFSv2 kernel?
+	if args.Conf.VFS2 {
+		kernel.VFS2Enabled = true
+		vfs2.Override()
+	}
+
+	// Create kernel and platform.
+	p, err := createPlatform(args.Conf, args.Device)
+	if err != nil {
+		return nil, fmt.Errorf("creating platform: %v", err)
+	}
+	k := &kernel.Kernel{
+		Platform: p,
+	}
+
+	// Create memory file.
+	mf, err := createMemoryFile()
+	if err != nil {
+		return nil, fmt.Errorf("creating memory file: %v", err)
+	}
+	k.SetMemoryFile(mf)
+
+	// Create VDSO.
+	//
+	// Pass k as the platform since it is savable, unlike the actual platform.
+	vdso, err := loader.PrepareVDSO(k)
+	if err != nil {
+		return nil, fmt.Errorf("creating vdso: %v", err)
+	}
+
+	// Create timekeeper.
+	tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
+	if err != nil {
+		return nil, fmt.Errorf("creating timekeeper: %v", err)
+	}
+	tk.SetClocks(time.NewCalibratedClocks())
+
+	if err := enableStrace(args.Conf); err != nil {
+		return nil, fmt.Errorf("enabling strace: %v", err)
+	}
+
+	// Create root network namespace/stack.
+	netns, err := newRootNetworkNamespace(args.Conf, k, k)
+	if err != nil {
+		return nil, fmt.Errorf("creating network: %v", err)
+	}
+
+	// Create capabilities.
+	caps, err := specutils.Capabilities(args.Conf.EnableRaw, args.Spec.Process.Capabilities)
+	if err != nil {
+		return nil, fmt.Errorf("converting capabilities: %v", err)
+	}
+
+	// Convert the spec's additional GIDs to KGIDs.
+	extraKGIDs := make([]auth.KGID, 0, len(args.Spec.Process.User.AdditionalGids))
+	for _, GID := range args.Spec.Process.User.AdditionalGids {
+		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
+	}
+
+	// Create credentials.
+	creds := auth.NewUserCredentials(
+		auth.KUID(args.Spec.Process.User.UID),
+		auth.KGID(args.Spec.Process.User.GID),
+		extraKGIDs,
+		caps,
+		auth.NewRootUserNamespace())
+
+	if args.NumCPU == 0 {
+		args.NumCPU = runtime.NumCPU()
+	}
+	log.Infof("CPUs: %d", args.NumCPU)
+
+	if args.TotalMem > 0 {
+		// Adjust the total memory returned by the Sentry so that applications that
+		// use /proc/meminfo can make allocations based on this limit.
+		usage.MinimumTotalMemoryBytes = args.TotalMem
+		log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30))
+	}
+
+	// Initiate the Kernel object, which is required by the Context passed
+	// to createVFS in order to mount (among other things) procfs.
+	if err = k.Init(kernel.InitKernelArgs{
+		FeatureSet:                  cpuid.HostFeatureSet(),
+		Timekeeper:                  tk,
+		RootUserNamespace:           creds.UserNamespace,
+		RootNetworkNamespace:        netns,
+		ApplicationCores:            uint(args.NumCPU),
+		Vdso:                        vdso,
+		RootUTSNamespace:            kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
+		RootIPCNamespace:            kernel.NewIPCNamespace(creds.UserNamespace),
+		RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
+		PIDNamespace:                kernel.NewRootPIDNamespace(creds.UserNamespace),
+	}); err != nil {
+		return nil, fmt.Errorf("initializing kernel: %v", err)
+	}
+
+	if kernel.VFS2Enabled {
+		if err := registerFilesystems(k); err != nil {
+			return nil, fmt.Errorf("registering filesystems: %w", err)
+		}
+	}
+
+	if err := adjustDirentCache(k); err != nil {
+		return nil, err
+	}
+
+	// Turn on packet logging if enabled.
+	if args.Conf.LogPackets {
+		log.Infof("Packet logging enabled")
+		atomic.StoreUint32(&sniffer.LogPackets, 1)
+	} else {
+		log.Infof("Packet logging disabled")
+		atomic.StoreUint32(&sniffer.LogPackets, 0)
+	}
+
+	// Create a watchdog.
+	dogOpts := watchdog.DefaultOpts
+	dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
+	dog := watchdog.New(k, dogOpts)
+
+	procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
+	if err != nil {
+		return nil, fmt.Errorf("creating init process for root container: %v", err)
+	}
+
+	if err := initCompatLogs(args.UserLogFD); err != nil {
+		return nil, fmt.Errorf("initializing compat logs: %v", err)
+	}
+
+	mountHints, err := newPodMountHints(args.Spec)
+	if err != nil {
+		return nil, fmt.Errorf("creating pod mount hints: %v", err)
+	}
+
+	if kernel.VFS2Enabled {
+		// Set up host mount that will be used for imported fds.
+		hostFilesystem, err := hostvfs2.NewFilesystem(k.VFS())
+		if err != nil {
+			return nil, fmt.Errorf("failed to create hostfs filesystem: %v", err)
+		}
+		defer hostFilesystem.DecRef()
+		hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{})
+		if err != nil {
+			return nil, fmt.Errorf("failed to create hostfs mount: %v", err)
+		}
+		k.SetHostMount(hostMount)
+	}
+
+	// Make host FDs stable between invocations. Host FDs must map to the exact
+	// same number when the sandbox is restored. Otherwise the wrong FD will be
+	// used.
+	var stdioFDs []int
+	newfd := startingStdioFD
+	for _, fd := range args.StdioFDs {
+		err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC)
+		if err != nil {
+			return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err)
+		}
+		stdioFDs = append(stdioFDs, newfd)
+		err = syscall.Close(fd)
+		if err != nil {
+			return nil, fmt.Errorf("close original stdioFDs failed: %v", err)
+		}
+		newfd++
+	}
+
+	eid := execID{cid: args.ID}
+	l := &Loader{
+		k:            k,
+		conf:         args.Conf,
+		console:      args.Console,
+		watchdog:     dog,
+		spec:         args.Spec,
+		goferFDs:     args.GoferFDs,
+		stdioFDs:     stdioFDs,
+		rootProcArgs: procArgs,
+		sandboxID:    args.ID,
+		processes:    map[execID]*execProcess{eid: {}},
+		mountHints:   mountHints,
+	}
+
+	// We don't care about child signals; some platforms can generate a
+	// tremendous number of useless ones (I'm looking at you, ptrace).
+	if err := sighandling.IgnoreChildStop(); err != nil {
+		return nil, fmt.Errorf("ignore child stop signals failed: %v", err)
+	}
+
+	// Create the control server using the provided FD.
+	//
+	// This must be done *after* we have initialized the kernel since the
+	// controller is used to configure the kernel's network stack.
+	ctrl, err := newController(args.ControllerFD, l)
+	if err != nil {
+		return nil, fmt.Errorf("creating control server: %v", err)
+	}
+	l.ctrl = ctrl
+
+	// Only start serving after Loader is set to controller and controller is set
+	// to Loader, because they are both used in the urpc methods.
+	if err := ctrl.srv.StartServing(); err != nil {
+		return nil, fmt.Errorf("starting control server: %v", err)
+	}
+
+	return l, nil
+}
+
+// newProcess creates a process that can be run with kernel.CreateProcess.
+func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
+	// Create initial limits.
+	ls, err := createLimitSet(spec)
+	if err != nil {
+		return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %v", err)
+	}
+
+	wd := spec.Process.Cwd
+	if wd == "" {
+		wd = "/"
+	}
+
+	// Create the process arguments.
+	procArgs := kernel.CreateProcessArgs{
+		Argv:                    spec.Process.Args,
+		Envv:                    spec.Process.Env,
+		WorkingDirectory:        wd,
+		Credentials:             creds,
+		Umask:                   0022,
+		Limits:                  ls,
+		MaxSymlinkTraversals:    linux.MaxSymlinkTraversals,
+		UTSNamespace:            k.RootUTSNamespace(),
+		IPCNamespace:            k.RootIPCNamespace(),
+		AbstractSocketNamespace: k.RootAbstractSocketNamespace(),
+		ContainerID:             id,
+		PIDNamespace:            pidns,
+	}
+
+	return procArgs, nil
+}
+
+// Destroy cleans up all resources used by the loader.
+//
+// Note that this will block until all open control server connections have
+// been closed. For that reason, this should NOT be called in a defer, because
+// a panic in a control server rpc would then hang forever.
+func (l *Loader) Destroy() {
+	if l.ctrl != nil {
+		l.ctrl.srv.Stop()
+	}
+	if l.stopSignalForwarding != nil {
+		l.stopSignalForwarding()
+	}
+	l.watchdog.Stop()
+}
+
+func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) {
+	p, err := platform.Lookup(conf.Platform)
+	if err != nil {
+		panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err))
+	}
+	log.Infof("Platform: %s", conf.Platform)
+	return p.New(deviceFile)
+}
+
+func createMemoryFile() (*pgalloc.MemoryFile, error) {
+	const memfileName = "runsc-memory"
+	memfd, err := memutil.CreateMemFD(memfileName, 0)
+	if err != nil {
+		return nil, fmt.Errorf("error creating memfd: %v", err)
+	}
+	memfile := os.NewFile(uintptr(memfd), memfileName)
+	// We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if
+	// there are memory cgroups specified, because at this point we're already
+	// in a mount namespace in which the relevant cgroupfs is not visible.
+	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
+	if err != nil {
+		memfile.Close()
+		return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err)
+	}
+	return mf, nil
+}
+
+func (l *Loader) installSeccompFilters() error {
+	if l.conf.DisableSeccomp {
+		filter.Report("syscall filter is DISABLED. Running in less secure mode.")
+	} else {
+		opts := filter.Options{
+			Platform:      l.k.Platform,
+			HostNetwork:   l.conf.Network == NetworkHost,
+			ProfileEnable: l.conf.ProfileEnable,
+			ControllerFD:  l.ctrl.srv.FD(),
+		}
+		if err := filter.Install(opts); err != nil {
+			return fmt.Errorf("installing seccomp filters: %v", err)
+		}
+	}
+	return nil
+}
+
+// Run runs the root container.
+func (l *Loader) Run() error {
+	err := l.run()
+	l.ctrl.manager.startResultChan <- err
+	if err != nil {
+		// Give the controller some time to send the error to the
+		// runtime. If we return too quickly here the process will exit
+		// and the control connection will be closed before the error
+		// is returned.
+		gtime.Sleep(2 * gtime.Second)
+		return err
+	}
+	return nil
+}
+
+func (l *Loader) run() error {
+	if l.conf.Network == NetworkHost {
+		// Delay host network configuration to this point because network namespace
+		// is configured after the loader is created and before Run() is called.
+		log.Debugf("Configuring host network")
+		stack := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack)
+		if err := stack.Configure(); err != nil {
+			return err
+		}
+	}
+
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	eid := execID{cid: l.sandboxID}
+	ep, ok := l.processes[eid]
+	if !ok {
+		return fmt.Errorf("trying to start deleted container %q", l.sandboxID)
+	}
+
+	// If we are restoring, we do not want to create a process.
+	// l.restore is set by the container manager when a restore call is made.
+	var ttyFile *host.TTYFileOperations
+	var ttyFileVFS2 *hostvfs2.TTYFileDescription
+	if !l.restore {
+		if l.conf.ProfileEnable {
+			pprof.Initialize()
+		}
+
+		// Finally done with all configuration. Setup filters before user code
+		// is loaded.
+		if err := l.installSeccompFilters(); err != nil {
+			return err
+		}
+
+		// Create the FD map, which will set stdin, stdout, and stderr.  If console
+		// is true, then ioctl calls will be passed through to the host fd.
+		ctx := l.rootProcArgs.NewContext(l.k)
+		var err error
+
+		// CreateProcess takes a reference on FDMap if successful. We won't need
+		// ours either way.
+		l.rootProcArgs.FDTable, ttyFile, ttyFileVFS2, err = createFDTable(ctx, l.console, l.stdioFDs)
+		if err != nil {
+			return fmt.Errorf("importing fds: %v", err)
+		}
+
+		// Setup the root container file system.
+		l.startGoferMonitor(l.sandboxID, l.goferFDs)
+
+		mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
+		if err := mntr.processHints(l.conf, l.rootProcArgs.Credentials); err != nil {
+			return err
+		}
+		if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil {
+			return err
+		}
+
+		// Add the HOME enviroment variable if it is not already set.
+		var envv []string
+		if kernel.VFS2Enabled {
+			envv, err = user.MaybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2,
+				l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
+
+		} else {
+			envv, err = user.MaybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace,
+				l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
+		}
+		if err != nil {
+			return err
+		}
+		l.rootProcArgs.Envv = envv
+
+		// Create the root container init task. It will begin running
+		// when the kernel is started.
+		if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil {
+			return fmt.Errorf("creating init process: %v", err)
+		}
+
+		// CreateProcess takes a reference on FDTable if successful.
+		l.rootProcArgs.FDTable.DecRef()
+	}
+
+	ep.tg = l.k.GlobalInit()
+	if ns, ok := specutils.GetNS(specs.PIDNamespace, l.spec); ok {
+		ep.pidnsPath = ns.Path
+	}
+	if l.console {
+		// Set the foreground process group on the TTY to the global init process
+		// group, since that is what we are about to start running.
+		switch {
+		case ttyFileVFS2 != nil:
+			ep.ttyVFS2 = ttyFileVFS2
+			ttyFileVFS2.InitForegroundProcessGroup(ep.tg.ProcessGroup())
+		case ttyFile != nil:
+			ep.tty = ttyFile
+			ttyFile.InitForegroundProcessGroup(ep.tg.ProcessGroup())
+		}
+	}
+
+	// Handle signals by forwarding them to the root container process
+	// (except for panic signal, which should cause a panic).
+	l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
+		// Panic signal should cause a panic.
+		if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) {
+			panic("Signal-induced panic")
+		}
+
+		// Otherwise forward to root container.
+		deliveryMode := DeliverToProcess
+		if l.console {
+			// Since we are running with a console, we should forward the signal to
+			// the foreground process group so that job control signals like ^C can
+			// be handled properly.
+			deliveryMode = DeliverToForegroundProcessGroup
+		}
+		log.Infof("Received external signal %d, mode: %v", sig, deliveryMode)
+		if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil {
+			log.Warningf("error sending signal %v to container %q: %v", sig, l.sandboxID, err)
+		}
+	})
+
+	// l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again
+	// either in createFDTable() during initial start or in descriptor.initAfterLoad()
+	// during restore, we can release l.stdioFDs now. VFS2 takes ownership of the
+	// passed FDs, so only close for VFS1.
+	if !kernel.VFS2Enabled {
+		for _, fd := range l.stdioFDs {
+			err := syscall.Close(fd)
+			if err != nil {
+				return fmt.Errorf("close dup()ed stdioFDs: %v", err)
+			}
+		}
+	}
+
+	log.Infof("Process should have started...")
+	l.watchdog.Start()
+	return l.k.Start()
+}
+
+// createContainer creates a new container inside the sandbox.
+func (l *Loader) createContainer(cid string) error {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	eid := execID{cid: cid}
+	if _, ok := l.processes[eid]; ok {
+		return fmt.Errorf("container %q already exists", cid)
+	}
+	l.processes[eid] = &execProcess{}
+	return nil
+}
+
+// startContainer starts a child container. It returns the thread group ID of
+// the newly created process. Caller owns 'files' and may close them after
+// this method returns.
+func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
+	// Create capabilities.
+	caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
+	if err != nil {
+		return fmt.Errorf("creating capabilities: %v", err)
+	}
+
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	eid := execID{cid: cid}
+	if _, ok := l.processes[eid]; !ok {
+		return fmt.Errorf("trying to start a deleted container %q", cid)
+	}
+
+	// Convert the spec's additional GIDs to KGIDs.
+	extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids))
+	for _, GID := range spec.Process.User.AdditionalGids {
+		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
+	}
+
+	// Create credentials. We reuse the root user namespace because the
+	// sentry currently supports only 1 mount namespace, which is tied to a
+	// single user namespace. Thus we must run in the same user namespace
+	// to access mounts.
+	creds := auth.NewUserCredentials(
+		auth.KUID(spec.Process.User.UID),
+		auth.KGID(spec.Process.User.GID),
+		extraKGIDs,
+		caps,
+		l.k.RootUserNamespace())
+
+	var pidns *kernel.PIDNamespace
+	if ns, ok := specutils.GetNS(specs.PIDNamespace, spec); ok {
+		if ns.Path != "" {
+			for _, p := range l.processes {
+				if ns.Path == p.pidnsPath {
+					pidns = p.tg.PIDNamespace()
+					break
+				}
+			}
+		}
+		if pidns == nil {
+			pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace())
+		}
+		l.processes[eid].pidnsPath = ns.Path
+	} else {
+		pidns = l.k.RootPIDNamespace()
+	}
+	procArgs, err := newProcess(cid, spec, creds, l.k, pidns)
+	if err != nil {
+		return fmt.Errorf("creating new process: %v", err)
+	}
+
+	// setupContainerFS() dups stdioFDs, so we don't need to dup them here.
+	var stdioFDs []int
+	for _, f := range files[:3] {
+		stdioFDs = append(stdioFDs, int(f.Fd()))
+	}
+
+	// Create the FD map, which will set stdin, stdout, and stderr.
+	ctx := procArgs.NewContext(l.k)
+	fdTable, _, _, err := createFDTable(ctx, false, stdioFDs)
+	if err != nil {
+		return fmt.Errorf("importing fds: %v", err)
+	}
+	// CreateProcess takes a reference on fdTable if successful. We won't
+	// need ours either way.
+	procArgs.FDTable = fdTable
+
+	// Can't take ownership away from os.File. dup them to get a new FDs.
+	var goferFDs []int
+	for _, f := range files[3:] {
+		fd, err := syscall.Dup(int(f.Fd()))
+		if err != nil {
+			return fmt.Errorf("failed to dup file: %v", err)
+		}
+		goferFDs = append(goferFDs, fd)
+	}
+
+	// Setup the child container file system.
+	l.startGoferMonitor(cid, goferFDs)
+
+	mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints)
+	if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil {
+		return err
+	}
+
+	// Add the HOME enviroment variable if it is not already set.
+	var envv []string
+	if kernel.VFS2Enabled {
+		envv, err = user.MaybeAddExecUserHomeVFS2(ctx, procArgs.MountNamespaceVFS2,
+			procArgs.Credentials.RealKUID, procArgs.Envv)
+
+	} else {
+		envv, err = user.MaybeAddExecUserHome(ctx, procArgs.MountNamespace,
+			procArgs.Credentials.RealKUID, procArgs.Envv)
+	}
+	if err != nil {
+		return err
+	}
+	procArgs.Envv = envv
+
+	// Create and start the new process.
+	tg, _, err := l.k.CreateProcess(procArgs)
+	if err != nil {
+		return fmt.Errorf("creating process: %v", err)
+	}
+	l.k.StartProcess(tg)
+
+	// CreateProcess takes a reference on FDTable if successful.
+	procArgs.FDTable.DecRef()
+
+	l.processes[eid].tg = tg
+	return nil
+}
+
+// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
+// the gofer FDs looking for disconnects, and destroys the container if a
+// disconnect occurs in any of the gofer FDs.
+func (l *Loader) startGoferMonitor(cid string, goferFDs []int) {
+	go func() {
+		log.Debugf("Monitoring gofer health for container %q", cid)
+		var events []unix.PollFd
+		for _, fd := range goferFDs {
+			events = append(events, unix.PollFd{
+				Fd:     int32(fd),
+				Events: unix.POLLHUP | unix.POLLRDHUP,
+			})
+		}
+		_, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) {
+			// Use ppoll instead of poll because it's already whilelisted in seccomp.
+			n, err := unix.Ppoll(events, nil, nil)
+			return uintptr(n), 0, err
+		})
+		if err != nil {
+			panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err))
+		}
+
+		// Check if the gofer has stopped as part of normal container destruction.
+		// This is done just to avoid sending an annoying error message to the log.
+		// Note that there is a small race window in between mu.Unlock() and the
+		// lock being reacquired in destroyContainer(), but it's harmless to call
+		// destroyContainer() multiple times.
+		l.mu.Lock()
+		_, ok := l.processes[execID{cid: cid}]
+		l.mu.Unlock()
+		if ok {
+			log.Infof("Gofer socket disconnected, destroying container %q", cid)
+			if err := l.destroyContainer(cid); err != nil {
+				log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err)
+			}
+		}
+	}()
+}
+
+// destroyContainer stops a container if it is still running and cleans up its
+// filesystem.
+func (l *Loader) destroyContainer(cid string) error {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid})
+	if err != nil {
+		// Container doesn't exist.
+		return err
+	}
+
+	// The container exists, but has it been started?
+	if tg != nil {
+		if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
+			return fmt.Errorf("sending SIGKILL to all container processes: %v", err)
+		}
+		// Wait for all processes that belong to the container to exit (including
+		// exec'd processes).
+		for _, t := range l.k.TaskSet().Root.Tasks() {
+			if t.ContainerID() == cid {
+				t.ThreadGroup().WaitExited()
+			}
+		}
+
+		// At this point, all processes inside of the container have exited,
+		// releasing all references to the container's MountNamespace and
+		// causing all submounts and overlays to be unmounted.
+		//
+		// Since the container's MountNamespace has been released,
+		// MountNamespace.destroy() will have executed, but that function may
+		// trigger async close operations. We must wait for those to complete
+		// before returning, otherwise the caller may kill the gofer before
+		// they complete, causing a cascade of failing RPCs.
+		fs.AsyncBarrier()
+	}
+
+	// No more failure from this point on. Remove all container thread groups
+	// from the map.
+	for key := range l.processes {
+		if key.cid == cid {
+			delete(l.processes, key)
+		}
+	}
+
+	log.Debugf("Container destroyed %q", cid)
+	return nil
+}
+
+func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
+	// Hold the lock for the entire operation to ensure that exec'd process is
+	// added to 'processes' in case it races with destroyContainer().
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID})
+	if err != nil {
+		return 0, err
+	}
+	if tg == nil {
+		return 0, fmt.Errorf("container %q not started", args.ContainerID)
+	}
+
+	// Get the container MountNamespace from the Task.
+	if kernel.VFS2Enabled {
+		// task.MountNamespace() does not take a ref, so we must do so ourselves.
+		args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2()
+		args.MountNamespaceVFS2.IncRef()
+	} else {
+		tg.Leader().WithMuLocked(func(t *kernel.Task) {
+			// task.MountNamespace() does not take a ref, so we must do so ourselves.
+			args.MountNamespace = t.MountNamespace()
+			args.MountNamespace.IncRef()
+		})
+	}
+
+	// Add the HOME environment variable if it is not already set.
+	if kernel.VFS2Enabled {
+		defer args.MountNamespaceVFS2.DecRef()
+
+		root := args.MountNamespaceVFS2.Root()
+		defer root.DecRef()
+		ctx := vfs.WithRoot(l.k.SupervisorContext(), root)
+		envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv)
+		if err != nil {
+			return 0, err
+		}
+		args.Envv = envv
+	} else {
+		defer args.MountNamespace.DecRef()
+
+		root := args.MountNamespace.Root()
+		defer root.DecRef()
+		ctx := fs.WithRoot(l.k.SupervisorContext(), root)
+		envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
+		if err != nil {
+			return 0, err
+		}
+		args.Envv = envv
+	}
+
+	// Start the process.
+	proc := control.Proc{Kernel: l.k}
+	args.PIDNamespace = tg.PIDNamespace()
+	newTG, tgid, ttyFile, ttyFileVFS2, err := control.ExecAsync(&proc, args)
+	if err != nil {
+		return 0, err
+	}
+
+	eid := execID{cid: args.ContainerID, pid: tgid}
+	l.processes[eid] = &execProcess{
+		tg:      newTG,
+		tty:     ttyFile,
+		ttyVFS2: ttyFileVFS2,
+	}
+	log.Debugf("updated processes: %v", l.processes)
+
+	return tgid, nil
+}
+
+// waitContainer waits for the init process of a container to exit.
+func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
+	// Don't defer unlock, as doing so would make it impossible for
+	// multiple clients to wait on the same container.
+	tg, err := l.threadGroupFromID(execID{cid: cid})
+	if err != nil {
+		return fmt.Errorf("can't wait for container %q: %v", cid, err)
+	}
+
+	// If the thread either has already exited or exits during waiting,
+	// consider the container exited.
+	ws := l.wait(tg)
+	*waitStatus = ws
+	return nil
+}
+
+func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error {
+	if tgid <= 0 {
+		return fmt.Errorf("PID (%d) must be positive", tgid)
+	}
+
+	// Try to find a process that was exec'd
+	eid := execID{cid: cid, pid: tgid}
+	execTG, err := l.threadGroupFromID(eid)
+	if err == nil {
+		ws := l.wait(execTG)
+		*waitStatus = ws
+
+		l.mu.Lock()
+		delete(l.processes, eid)
+		log.Debugf("updated processes (removal): %v", l.processes)
+		l.mu.Unlock()
+		return nil
+	}
+
+	// The caller may be waiting on a process not started directly via exec.
+	// In this case, find the process in the container's PID namespace.
+	initTG, err := l.threadGroupFromID(execID{cid: cid})
+	if err != nil {
+		return fmt.Errorf("waiting for PID %d: %v", tgid, err)
+	}
+	tg := initTG.PIDNamespace().ThreadGroupWithID(tgid)
+	if tg == nil {
+		return fmt.Errorf("waiting for PID %d: no such process", tgid)
+	}
+	if tg.Leader().ContainerID() != cid {
+		return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID())
+	}
+	ws := l.wait(tg)
+	*waitStatus = ws
+	return nil
+}
+
+// wait waits for the process with TGID 'tgid' in a container's PID namespace
+// to exit.
+func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 {
+	tg.WaitExited()
+	return tg.ExitStatus().Status()
+}
+
+// WaitForStartSignal waits for a start signal from the control server.
+func (l *Loader) WaitForStartSignal() {
+	<-l.ctrl.manager.startChan
+}
+
+// WaitExit waits for the root container to exit, and returns its exit status.
+func (l *Loader) WaitExit() kernel.ExitStatus {
+	// Wait for container.
+	l.k.WaitExited()
+
+	return l.k.GlobalInit().ExitStatus()
+}
+
+func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
+	// Create an empty network stack because the network namespace may be empty at
+	// this point. Netns is configured before Run() is called. Netstack is
+	// configured using a control uRPC message. Host network is configured inside
+	// Run().
+	switch conf.Network {
+	case NetworkHost:
+		// No network namespacing support for hostinet yet, hence creator is nil.
+		return inet.NewRootNamespace(hostinet.NewStack(), nil), nil
+
+	case NetworkNone, NetworkSandbox:
+		s, err := newEmptySandboxNetworkStack(clock, uniqueID)
+		if err != nil {
+			return nil, err
+		}
+		creator := &sandboxNetstackCreator{
+			clock:    clock,
+			uniqueID: uniqueID,
+		}
+		return inet.NewRootNamespace(s, creator), nil
+
+	default:
+		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
+	}
+
+}
+
+func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
+	netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
+	transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
+	s := netstack.Stack{stack.New(stack.Options{
+		NetworkProtocols:   netProtos,
+		TransportProtocols: transProtos,
+		Clock:              clock,
+		Stats:              netstack.Metrics,
+		HandleLocal:        true,
+		// Enable raw sockets for users with sufficient
+		// privileges.
+		RawFactory: raw.EndpointFactory{},
+		UniqueID:   uniqueID,
+	})}
+
+	// Enable SACK Recovery.
+	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
+		return nil, fmt.Errorf("failed to enable SACK: %s", err)
+	}
+
+	// Set default TTLs as required by socket/netstack.
+	s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+	s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+
+	// Enable Receive Buffer Auto-Tuning.
+	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
+		return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err)
+	}
+
+	return &s, nil
+}
+
+// sandboxNetstackCreator implements kernel.NetworkStackCreator.
+//
+// +stateify savable
+type sandboxNetstackCreator struct {
+	clock    tcpip.Clock
+	uniqueID stack.UniqueID
+}
+
+// CreateStack implements kernel.NetworkStackCreator.CreateStack.
+func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
+	s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID)
+	if err != nil {
+		return nil, err
+	}
+
+	// Setup loopback.
+	n := &Network{Stack: s.(*netstack.Stack).Stack}
+	nicID := tcpip.NICID(f.uniqueID.UniqueID())
+	link := DefaultLoopbackLink
+	linkEP := loopback.New()
+	if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
+		return nil, err
+	}
+
+	return s, nil
+}
+
+// signal sends a signal to one or more processes in a container. If PID is 0,
+// then the container init process is used. Depending on the SignalDeliveryMode
+// option, the signal may be sent directly to the indicated process, to all
+// processes in the container, or to the foreground process group.
+func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error {
+	if pid < 0 {
+		return fmt.Errorf("PID (%d) must be positive", pid)
+	}
+
+	switch mode {
+	case DeliverToProcess:
+		if err := l.signalProcess(cid, kernel.ThreadID(pid), signo); err != nil {
+			return fmt.Errorf("signaling process in container %q PID %d: %v", cid, pid, err)
+		}
+		return nil
+
+	case DeliverToForegroundProcessGroup:
+		if err := l.signalForegrondProcessGroup(cid, kernel.ThreadID(pid), signo); err != nil {
+			return fmt.Errorf("signaling foreground process group in container %q PID %d: %v", cid, pid, err)
+		}
+		return nil
+
+	case DeliverToAllProcesses:
+		if pid != 0 {
+			return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid)
+		}
+		// Check that the container has actually started before signaling it.
+		if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil {
+			return err
+		}
+		if err := l.signalAllProcesses(cid, signo); err != nil {
+			return fmt.Errorf("signaling all processes in container %q: %v", cid, err)
+		}
+		return nil
+
+	default:
+		panic(fmt.Sprintf("unknown signal delivery mode %v", mode))
+	}
+}
+
+func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error {
+	execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
+	if err == nil {
+		// Send signal directly to the identified process.
+		return l.k.SendExternalSignalThreadGroup(execTG, &arch.SignalInfo{Signo: signo})
+	}
+
+	// The caller may be signaling a process not started directly via exec.
+	// In this case, find the process in the container's PID namespace and
+	// signal it.
+	initTG, err := l.threadGroupFromID(execID{cid: cid})
+	if err != nil {
+		return fmt.Errorf("no thread group found: %v", err)
+	}
+	tg := initTG.PIDNamespace().ThreadGroupWithID(tgid)
+	if tg == nil {
+		return fmt.Errorf("no such process with PID %d", tgid)
+	}
+	if tg.Leader().ContainerID() != cid {
+		return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID())
+	}
+	return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo})
+}
+
+// signalForegrondProcessGroup looks up foreground process group from the TTY
+// for the given "tgid" inside container "cid", and send the signal to it.
+func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error {
+	l.mu.Lock()
+	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid})
+	if err != nil {
+		l.mu.Unlock()
+		return fmt.Errorf("no thread group found: %v", err)
+	}
+	if tg == nil {
+		l.mu.Unlock()
+		return fmt.Errorf("container %q not started", cid)
+	}
+
+	tty, ttyVFS2, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid})
+	l.mu.Unlock()
+	if err != nil {
+		return fmt.Errorf("no thread group found: %v", err)
+	}
+
+	var pg *kernel.ProcessGroup
+	switch {
+	case ttyVFS2 != nil:
+		pg = ttyVFS2.ForegroundProcessGroup()
+	case tty != nil:
+		pg = tty.ForegroundProcessGroup()
+	default:
+		return fmt.Errorf("no TTY attached")
+	}
+	if pg == nil {
+		// No foreground process group has been set. Signal the
+		// original thread group.
+		log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid)
+		return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo})
+	}
+	// Send the signal to all processes in the process group.
+	var lastErr error
+	for _, tg := range l.k.TaskSet().Root.ThreadGroups() {
+		if tg.ProcessGroup() != pg {
+			continue
+		}
+		if err := l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}); err != nil {
+			lastErr = err
+		}
+	}
+	return lastErr
+}
+
+// signalAllProcesses that belong to specified container. It's a noop if the
+// container hasn't started or has exited.
+func (l *Loader) signalAllProcesses(cid string, signo int32) error {
+	// Pause the kernel to prevent new processes from being created while
+	// the signal is delivered. This prevents process leaks when SIGKILL is
+	// sent to the entire container.
+	l.k.Pause()
+	defer l.k.Unpause()
+	return l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo})
+}
+
+// threadGroupFromID is similar to tryThreadGroupFromIDLocked except that it
+// acquires mutex before calling it and fails in case container hasn't started
+// yet.
+func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+	tg, err := l.tryThreadGroupFromIDLocked(key)
+	if err != nil {
+		return nil, err
+	}
+	if tg == nil {
+		return nil, fmt.Errorf("container %q not started", key.cid)
+	}
+	return tg, nil
+}
+
+// tryThreadGroupFromIDLocked returns the thread group for the given execution
+// ID. It may return nil in case the container has not started yet. Returns
+// error if execution ID is invalid or if the container cannot be found (maybe
+// it has been deleted). Caller must hold 'mu'.
+func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) {
+	ep := l.processes[key]
+	if ep == nil {
+		return nil, fmt.Errorf("container %q not found", key.cid)
+	}
+	return ep.tg, nil
+}
+
+// ttyFromIDLocked returns the TTY files for the given execution ID. It may
+// return nil in case the container has not started yet. Returns error if
+// execution ID is invalid or if the container cannot be found (maybe it has
+// been deleted). Caller must hold 'mu'.
+func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+	ep := l.processes[key]
+	if ep == nil {
+		return nil, nil, fmt.Errorf("container %q not found", key.cid)
+	}
+	return ep.tty, ep.ttyVFS2, nil
+}
+
+func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+	if len(stdioFDs) != 3 {
+		return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
+	}
+
+	k := kernel.KernelFromContext(ctx)
+	fdTable := k.NewFDTable()
+	ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs)
+	if err != nil {
+		fdTable.DecRef()
+		return nil, nil, nil, err
+	}
+	return fdTable, ttyFile, ttyFileVFS2, nil
+}
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
new file mode 100644
index 000000000..b723e4335
--- /dev/null
+++ b/runsc/boot/loader_test.go
@@ -0,0 +1,715 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"math/rand"
+	"os"
+	"reflect"
+	"syscall"
+	"testing"
+	"time"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/control/server"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/unet"
+	"gvisor.dev/gvisor/runsc/fsgofer"
+)
+
+func init() {
+	log.SetLevel(log.Debug)
+	rand.Seed(time.Now().UnixNano())
+	if err := fsgofer.OpenProcSelfFD(); err != nil {
+		panic(err)
+	}
+}
+
+func testConfig() *Config {
+	return &Config{
+		RootDir:        "unused_root_dir",
+		Network:        NetworkNone,
+		DisableSeccomp: true,
+		Platform:       "ptrace",
+	}
+}
+
+// testSpec returns a simple spec that can be used in tests.
+func testSpec() *specs.Spec {
+	return &specs.Spec{
+		// The host filesystem root is the sandbox root.
+		Root: &specs.Root{
+			Path:     "/",
+			Readonly: true,
+		},
+		Process: &specs.Process{
+			Args: []string{"/bin/true"},
+		},
+	}
+}
+
+// startGofer starts a new gofer routine serving 'root' path. It returns the
+// sandbox side of the connection, and a function that when called will stop the
+// gofer.
+func startGofer(root string) (int, func(), error) {
+	fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
+	if err != nil {
+		return 0, nil, err
+	}
+	sandboxEnd, goferEnd := fds[0], fds[1]
+
+	socket, err := unet.NewSocket(goferEnd)
+	if err != nil {
+		syscall.Close(sandboxEnd)
+		syscall.Close(goferEnd)
+		return 0, nil, fmt.Errorf("error creating server on FD %d: %v", goferEnd, err)
+	}
+	at, err := fsgofer.NewAttachPoint(root, fsgofer.Config{ROMount: true})
+	if err != nil {
+		return 0, nil, err
+	}
+	go func() {
+		s := p9.NewServer(at)
+		if err := s.Handle(socket); err != nil {
+			log.Infof("Gofer is stopping. FD: %d, err: %v\n", goferEnd, err)
+		}
+	}()
+	// Closing the gofer socket will stop the gofer and exit goroutine above.
+	cleanup := func() {
+		if err := socket.Close(); err != nil {
+			log.Warningf("Error closing gofer socket: %v", err)
+		}
+	}
+	return sandboxEnd, cleanup, nil
+}
+
+func createLoader(vfsEnabled bool, spec *specs.Spec) (*Loader, func(), error) {
+	fd, err := server.CreateSocket(ControlSocketAddr(fmt.Sprintf("%010d", rand.Int())[:10]))
+	if err != nil {
+		return nil, nil, err
+	}
+	conf := testConfig()
+	conf.VFS2 = vfsEnabled
+
+	sandEnd, cleanup, err := startGofer(spec.Root.Path)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	// Loader takes ownership of stdio.
+	var stdio []int
+	for _, f := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
+		newFd, err := unix.Dup(int(f.Fd()))
+		if err != nil {
+			return nil, nil, err
+		}
+		stdio = append(stdio, newFd)
+	}
+
+	args := Args{
+		ID:           "foo",
+		Spec:         spec,
+		Conf:         conf,
+		ControllerFD: fd,
+		GoferFDs:     []int{sandEnd},
+		StdioFDs:     stdio,
+	}
+	l, err := New(args)
+	if err != nil {
+		cleanup()
+		return nil, nil, err
+	}
+	return l, cleanup, nil
+}
+
+// TestRun runs a simple application in a sandbox and checks that it succeeds.
+func TestRun(t *testing.T) {
+	doRun(t, false)
+}
+
+// TestRunVFS2 runs TestRun in VFSv2.
+func TestRunVFS2(t *testing.T) {
+	doRun(t, true)
+}
+
+func doRun(t *testing.T, vfsEnabled bool) {
+	l, cleanup, err := createLoader(vfsEnabled, testSpec())
+	if err != nil {
+		t.Fatalf("error creating loader: %v", err)
+	}
+
+	defer l.Destroy()
+	defer cleanup()
+
+	// Start a goroutine to read the start chan result, otherwise Run will
+	// block forever.
+	var resultChanErr error
+	var wg sync.WaitGroup
+	wg.Add(1)
+	go func() {
+		resultChanErr = <-l.ctrl.manager.startResultChan
+		wg.Done()
+	}()
+
+	// Run the container.
+	if err := l.Run(); err != nil {
+		t.Errorf("error running container: %v", err)
+	}
+
+	// We should have not gotten an error on the startResultChan.
+	wg.Wait()
+	if resultChanErr != nil {
+		t.Errorf("error on startResultChan: %v", resultChanErr)
+	}
+
+	// Wait for the application to exit.  It should succeed.
+	if status := l.WaitExit(); status.Code != 0 || status.Signo != 0 {
+		t.Errorf("application exited with status %+v, want 0", status)
+	}
+}
+
+// TestStartSignal tests that the controller Start message will cause
+// WaitForStartSignal to return.
+func TestStartSignal(t *testing.T) {
+	doStartSignal(t, false)
+}
+
+// TestStartSignalVFS2 does TestStartSignal with VFS2.
+func TestStartSignalVFS2(t *testing.T) {
+	doStartSignal(t, true)
+}
+
+func doStartSignal(t *testing.T, vfsEnabled bool) {
+	l, cleanup, err := createLoader(vfsEnabled, testSpec())
+	if err != nil {
+		t.Fatalf("error creating loader: %v", err)
+	}
+	defer l.Destroy()
+	defer cleanup()
+
+	// We aren't going to wait on this application, so the control server
+	// needs to be shut down manually.
+	defer l.ctrl.srv.Stop()
+
+	// Start a goroutine that calls WaitForStartSignal and writes to a
+	// channel when it returns.
+	waitFinished := make(chan struct{})
+	go func() {
+		l.WaitForStartSignal()
+		// Pretend that Run() executed and returned no error.
+		l.ctrl.manager.startResultChan <- nil
+		waitFinished <- struct{}{}
+	}()
+
+	// Nothing has been written to the channel, so waitFinished should not
+	// return.  Give it a little bit of time to make sure the goroutine has
+	// started.
+	select {
+	case <-waitFinished:
+		t.Errorf("WaitForStartSignal completed but it should not have")
+	case <-time.After(50 * time.Millisecond):
+		// OK.
+	}
+
+	// Trigger the control server StartRoot method.
+	cid := "foo"
+	if err := l.ctrl.manager.StartRoot(&cid, nil); err != nil {
+		t.Errorf("error calling StartRoot: %v", err)
+	}
+
+	// Now WaitForStartSignal should return (within a short amount of
+	// time).
+	select {
+	case <-waitFinished:
+		// OK.
+	case <-time.After(50 * time.Millisecond):
+		t.Errorf("WaitForStartSignal did not complete but it should have")
+	}
+
+}
+
+type CreateMountTestcase struct {
+	name string
+	// Spec that will be used to create the mount manager.  Note
+	// that we can't mount procfs without a kernel, so each spec
+	// MUST contain something other than procfs mounted at /proc.
+	spec specs.Spec
+	// Paths that are expected to exist in the resulting fs.
+	expectedPaths []string
+}
+
+func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
+	testCases := []*CreateMountTestcase{
+		&CreateMountTestcase{
+			// Only proc.
+			name: "only proc mount",
+			spec: specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
+				},
+				Mounts: []specs.Mount{
+					{
+						Destination: "/proc",
+						Type:        "tmpfs",
+					},
+				},
+			},
+			// /proc, /dev, and /sys should always be mounted.
+			expectedPaths: []string{"/proc", "/dev", "/sys"},
+		},
+		{
+			// Mount at a deep path, with many components that do
+			// not exist in the root.
+			name: "deep mount path",
+			spec: specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
+				},
+				Mounts: []specs.Mount{
+					{
+						Destination: "/some/very/very/deep/path",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/proc",
+						Type:        "tmpfs",
+					},
+				},
+			},
+			// /some/deep/path should be mounted, along with /proc,
+			// /dev, and /sys.
+			expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"},
+		},
+		&CreateMountTestcase{
+			// Mounts are nested inside each other.
+			name: "nested mounts",
+			spec: specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
+				},
+				Mounts: []specs.Mount{
+					{
+						Destination: "/proc",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/foo",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/foo/qux",
+						Type:        "tmpfs",
+					},
+					{
+						// File mounts with the same prefix.
+						Destination: "/foo/qux-quz",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/foo/bar",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/foo/bar/baz",
+						Type:        "tmpfs",
+					},
+					{
+						// A deep path that is in foo but not the other mounts.
+						Destination: "/foo/some/very/very/deep/path",
+						Type:        "tmpfs",
+					},
+				},
+			},
+			expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/qux",
+				"/foo/qux-quz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"},
+		},
+		&CreateMountTestcase{
+			name: "mount inside /dev",
+			spec: specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
+				},
+				Mounts: []specs.Mount{
+					{
+						Destination: "/proc",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/dev",
+						Type:        "tmpfs",
+					},
+					{
+						// Mounted by runsc by default.
+						Destination: "/dev/fd",
+						Type:        "tmpfs",
+					},
+					{
+						// Mount with the same prefix.
+						Destination: "/dev/fd-foo",
+						Type:        "tmpfs",
+					},
+					{
+						// Unsupported fs type.
+						Destination: "/dev/mqueue",
+						Type:        "mqueue",
+					},
+					{
+						Destination: "/dev/foo",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/dev/bar",
+						Type:        "tmpfs",
+					},
+				},
+			},
+			expectedPaths: []string{"/proc", "/dev", "/dev/fd-foo", "/dev/foo", "/dev/bar", "/sys"},
+		},
+	}
+
+	vfsCase := &CreateMountTestcase{
+		name: "mounts inside mandatory mounts",
+		spec: specs.Spec{
+			Root: &specs.Root{
+				Path:     os.TempDir(),
+				Readonly: true,
+			},
+			Mounts: []specs.Mount{
+				{
+					Destination: "/proc",
+					Type:        "tmpfs",
+				},
+				// TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports
+				//  MkDirAt in VFS2 (and remove the reduntant append).
+				// {
+				//		Destination: "/sys/bar",
+				//		Type:        "tmpfs",
+				//	},
+				//
+				{
+					Destination: "/tmp/baz",
+					Type:        "tmpfs",
+				},
+			},
+		},
+		expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"},
+	}
+
+	if !vfs2 {
+		vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"})
+		vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar")
+	}
+	return append(testCases, vfsCase)
+}
+
+// Test that MountNamespace can be created with various specs.
+func TestCreateMountNamespace(t *testing.T) {
+	for _, tc := range createMountTestcases(false /* vfs2 */) {
+		t.Run(tc.name, func(t *testing.T) {
+			conf := testConfig()
+			ctx := contexttest.Context(t)
+
+			sandEnd, cleanup, err := startGofer(tc.spec.Root.Path)
+			if err != nil {
+				t.Fatalf("failed to create gofer: %v", err)
+			}
+			defer cleanup()
+
+			mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{})
+			mns, err := mntr.createMountNamespace(ctx, conf)
+			if err != nil {
+				t.Fatalf("failed to create mount namespace: %v", err)
+			}
+			ctx = fs.WithRoot(ctx, mns.Root())
+			if err := mntr.mountSubmounts(ctx, conf, mns); err != nil {
+				t.Fatalf("failed to create mount namespace: %v", err)
+			}
+
+			root := mns.Root()
+			defer root.DecRef()
+			for _, p := range tc.expectedPaths {
+				maxTraversals := uint(0)
+				if d, err := mns.FindInode(ctx, root, root, p, &maxTraversals); err != nil {
+					t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
+				} else {
+					d.DecRef()
+				}
+			}
+		})
+	}
+}
+
+// Test that MountNamespace can be created with various specs.
+func TestCreateMountNamespaceVFS2(t *testing.T) {
+	for _, tc := range createMountTestcases(true /* vfs2 */) {
+		t.Run(tc.name, func(t *testing.T) {
+			spec := testSpec()
+			spec.Mounts = tc.spec.Mounts
+			spec.Root = tc.spec.Root
+
+			t.Logf("Using root: %q", spec.Root.Path)
+			l, loaderCleanup, err := createLoader(true /* VFS2 Enabled */, spec)
+			if err != nil {
+				t.Fatalf("failed to create loader: %v", err)
+			}
+			defer l.Destroy()
+			defer loaderCleanup()
+
+			mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
+			if err := mntr.processHints(l.conf, l.rootProcArgs.Credentials); err != nil {
+				t.Fatalf("failed process hints: %v", err)
+			}
+
+			ctx := l.k.SupervisorContext()
+			mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs)
+			if err != nil {
+				t.Fatalf("failed to setupVFS2: %v", err)
+			}
+
+			root := mns.Root()
+			defer root.DecRef()
+			for _, p := range tc.expectedPaths {
+				target := &vfs.PathOperation{
+					Root:  root,
+					Start: root,
+					Path:  fspath.Parse(p),
+				}
+
+				if d, err := l.k.VFS().GetDentryAt(ctx, l.rootProcArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil {
+					t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
+				} else {
+					d.DecRef()
+				}
+			}
+		})
+	}
+}
+
+// TestRestoreEnvironment tests that the correct mounts are collected from the spec and config
+// in order to build the environment for restoring.
+func TestRestoreEnvironment(t *testing.T) {
+	testCases := []struct {
+		name          string
+		spec          *specs.Spec
+		ioFDs         []int
+		errorExpected bool
+		expectedRenv  fs.RestoreEnvironment
+	}{
+		{
+			name: "basic spec test",
+			spec: &specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
+				},
+				Mounts: []specs.Mount{
+					{
+						Destination: "/some/very/very/deep/path",
+						Type:        "tmpfs",
+					},
+					{
+						Destination: "/proc",
+						Type:        "tmpfs",
+					},
+				},
+			},
+			ioFDs:         []int{0},
+			errorExpected: false,
+			expectedRenv: fs.RestoreEnvironment{
+				MountSources: map[string][]fs.MountArgs{
+					"9p": {
+						{
+							Dev:        "9pfs-/",
+							Flags:      fs.MountSourceFlags{ReadOnly: true},
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+						},
+					},
+					"tmpfs": {
+						{
+							Dev: "none",
+						},
+						{
+							Dev: "none",
+						},
+						{
+							Dev: "none",
+						},
+					},
+					"devtmpfs": {
+						{
+							Dev: "none",
+						},
+					},
+					"devpts": {
+						{
+							Dev: "none",
+						},
+					},
+					"sysfs": {
+						{
+							Dev: "none",
+						},
+					},
+				},
+			},
+		},
+		{
+			name: "bind type test",
+			spec: &specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
+				},
+				Mounts: []specs.Mount{
+					{
+						Destination: "/dev/fd-foo",
+						Type:        "bind",
+					},
+				},
+			},
+			ioFDs:         []int{0, 1},
+			errorExpected: false,
+			expectedRenv: fs.RestoreEnvironment{
+				MountSources: map[string][]fs.MountArgs{
+					"9p": {
+						{
+							Dev:        "9pfs-/",
+							Flags:      fs.MountSourceFlags{ReadOnly: true},
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+						},
+						{
+							Dev:        "9pfs-/dev/fd-foo",
+							DataString: "trans=fd,rfdno=1,wfdno=1,privateunixsocket=true,cache=remote_revalidating",
+						},
+					},
+					"tmpfs": {
+						{
+							Dev: "none",
+						},
+					},
+					"devtmpfs": {
+						{
+							Dev: "none",
+						},
+					},
+					"devpts": {
+						{
+							Dev: "none",
+						},
+					},
+					"proc": {
+						{
+							Dev: "none",
+						},
+					},
+					"sysfs": {
+						{
+							Dev: "none",
+						},
+					},
+				},
+			},
+		},
+		{
+			name: "options test",
+			spec: &specs.Spec{
+				Root: &specs.Root{
+					Path:     os.TempDir(),
+					Readonly: true,
+				},
+				Mounts: []specs.Mount{
+					{
+						Destination: "/dev/fd-foo",
+						Type:        "tmpfs",
+						Options:     []string{"uid=1022", "noatime"},
+					},
+				},
+			},
+			ioFDs:         []int{0},
+			errorExpected: false,
+			expectedRenv: fs.RestoreEnvironment{
+				MountSources: map[string][]fs.MountArgs{
+					"9p": {
+						{
+							Dev:        "9pfs-/",
+							Flags:      fs.MountSourceFlags{ReadOnly: true},
+							DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+						},
+					},
+					"tmpfs": {
+						{
+							Dev:        "none",
+							Flags:      fs.MountSourceFlags{NoAtime: true},
+							DataString: "uid=1022",
+						},
+						{
+							Dev: "none",
+						},
+					},
+					"devtmpfs": {
+						{
+							Dev: "none",
+						},
+					},
+					"devpts": {
+						{
+							Dev: "none",
+						},
+					},
+					"proc": {
+						{
+							Dev: "none",
+						},
+					},
+					"sysfs": {
+						{
+							Dev: "none",
+						},
+					},
+				},
+			},
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			conf := testConfig()
+			mntr := newContainerMounter(tc.spec, tc.ioFDs, nil, &podMountHints{})
+			actualRenv, err := mntr.createRestoreEnvironment(conf)
+			if !tc.errorExpected && err != nil {
+				t.Fatalf("could not create restore environment for test:%s", tc.name)
+			} else if tc.errorExpected {
+				if err == nil {
+					t.Errorf("expected an error, but no error occurred.")
+				}
+			} else {
+				if !reflect.DeepEqual(*actualRenv, tc.expectedRenv) {
+					t.Errorf("restore environments did not match for test:%s\ngot:%+v\nwant:%+v\n", tc.name, *actualRenv, tc.expectedRenv)
+				}
+			}
+		})
+	}
+}
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
new file mode 100644
index 000000000..14d2f56a5
--- /dev/null
+++ b/runsc/boot/network.go
@@ -0,0 +1,341 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"net"
+	"runtime"
+	"strings"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo"
+	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
+	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/urpc"
+)
+
+var (
+	// DefaultLoopbackLink contains IP addresses and routes of "127.0.0.1/8" and
+	// "::1/8" on "lo" interface.
+	DefaultLoopbackLink = LoopbackLink{
+		Name: "lo",
+		Addresses: []net.IP{
+			net.IP("\x7f\x00\x00\x01"),
+			net.IPv6loopback,
+		},
+		Routes: []Route{
+			{
+				Destination: net.IPNet{
+					IP:   net.IPv4(0x7f, 0, 0, 0),
+					Mask: net.IPv4Mask(0xff, 0, 0, 0),
+				},
+			},
+			{
+				Destination: net.IPNet{
+					IP:   net.IPv6loopback,
+					Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)),
+				},
+			},
+		},
+	}
+)
+
+// Network exposes methods that can be used to configure a network stack.
+type Network struct {
+	Stack *stack.Stack
+}
+
+// Route represents a route in the network stack.
+type Route struct {
+	Destination net.IPNet
+	Gateway     net.IP
+}
+
+// DefaultRoute represents a catch all route to the default gateway.
+type DefaultRoute struct {
+	Route Route
+	Name  string
+}
+
+// QueueingDiscipline is used to specify the kind of Queueing Discipline to
+// apply for a give FDBasedLink.
+type QueueingDiscipline int
+
+const (
+	// QDiscNone disables any queueing for the underlying FD.
+	QDiscNone QueueingDiscipline = iota
+
+	// QDiscFIFO applies a simple fifo based queue to the underlying
+	// FD.
+	QDiscFIFO
+)
+
+// MakeQueueingDiscipline if possible the equivalent QueuingDiscipline for s
+// else returns an error.
+func MakeQueueingDiscipline(s string) (QueueingDiscipline, error) {
+	switch s {
+	case "none":
+		return QDiscNone, nil
+	case "fifo":
+		return QDiscFIFO, nil
+	default:
+		return 0, fmt.Errorf("unsupported qdisc specified: %q", s)
+	}
+}
+
+// String implements fmt.Stringer.
+func (q QueueingDiscipline) String() string {
+	switch q {
+	case QDiscNone:
+		return "none"
+	case QDiscFIFO:
+		return "fifo"
+	default:
+		panic(fmt.Sprintf("Invalid queueing discipline: %d", q))
+	}
+}
+
+// FDBasedLink configures an fd-based link.
+type FDBasedLink struct {
+	Name               string
+	MTU                int
+	Addresses          []net.IP
+	Routes             []Route
+	GSOMaxSize         uint32
+	SoftwareGSOEnabled bool
+	TXChecksumOffload  bool
+	RXChecksumOffload  bool
+	LinkAddress        net.HardwareAddr
+	QDisc              QueueingDiscipline
+
+	// NumChannels controls how many underlying FD's are to be used to
+	// create this endpoint.
+	NumChannels int
+}
+
+// LoopbackLink configures a loopback li nk.
+type LoopbackLink struct {
+	Name      string
+	Addresses []net.IP
+	Routes    []Route
+}
+
+// CreateLinksAndRoutesArgs are arguments to CreateLinkAndRoutes.
+type CreateLinksAndRoutesArgs struct {
+	// FilePayload contains the fds associated with the FDBasedLinks. The
+	// number of fd's should match the sum of the NumChannels field of the
+	// FDBasedLink entries below.
+	urpc.FilePayload
+
+	LoopbackLinks []LoopbackLink
+	FDBasedLinks  []FDBasedLink
+
+	Defaultv4Gateway DefaultRoute
+	Defaultv6Gateway DefaultRoute
+}
+
+// Empty returns true if route hasn't been set.
+func (r *Route) Empty() bool {
+	return r.Destination.IP == nil && r.Destination.Mask == nil && r.Gateway == nil
+}
+
+func (r *Route) toTcpipRoute(id tcpip.NICID) (tcpip.Route, error) {
+	subnet, err := tcpip.NewSubnet(ipToAddress(r.Destination.IP), ipMaskToAddressMask(r.Destination.Mask))
+	if err != nil {
+		return tcpip.Route{}, err
+	}
+	return tcpip.Route{
+		Destination: subnet,
+		Gateway:     ipToAddress(r.Gateway),
+		NIC:         id,
+	}, nil
+}
+
+// CreateLinksAndRoutes creates links and routes in a network stack.  It should
+// only be called once.
+func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error {
+	wantFDs := 0
+	for _, l := range args.FDBasedLinks {
+		wantFDs += l.NumChannels
+	}
+	if got := len(args.FilePayload.Files); got != wantFDs {
+		return fmt.Errorf("args.FilePayload.Files has %d FD's but we need %d entries based on FDBasedLinks", got, wantFDs)
+	}
+
+	var nicID tcpip.NICID
+	nicids := make(map[string]tcpip.NICID)
+
+	// Collect routes from all links.
+	var routes []tcpip.Route
+
+	// Loopback normally appear before other interfaces.
+	for _, link := range args.LoopbackLinks {
+		nicID++
+		nicids[link.Name] = nicID
+
+		linkEP := loopback.New()
+
+		log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses)
+		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
+			return err
+		}
+
+		// Collect the routes from this link.
+		for _, r := range link.Routes {
+			route, err := r.toTcpipRoute(nicID)
+			if err != nil {
+				return err
+			}
+			routes = append(routes, route)
+		}
+	}
+
+	fdOffset := 0
+	for _, link := range args.FDBasedLinks {
+		nicID++
+		nicids[link.Name] = nicID
+
+		FDs := []int{}
+		for j := 0; j < link.NumChannels; j++ {
+			// Copy the underlying FD.
+			oldFD := args.FilePayload.Files[fdOffset].Fd()
+			newFD, err := syscall.Dup(int(oldFD))
+			if err != nil {
+				return fmt.Errorf("failed to dup FD %v: %v", oldFD, err)
+			}
+			FDs = append(FDs, newFD)
+			fdOffset++
+		}
+
+		mac := tcpip.LinkAddress(link.LinkAddress)
+		log.Infof("gso max size is: %d", link.GSOMaxSize)
+
+		linkEP, err := fdbased.New(&fdbased.Options{
+			FDs:                FDs,
+			MTU:                uint32(link.MTU),
+			EthernetHeader:     true,
+			Address:            mac,
+			PacketDispatchMode: fdbased.RecvMMsg,
+			GSOMaxSize:         link.GSOMaxSize,
+			SoftwareGSOEnabled: link.SoftwareGSOEnabled,
+			TXChecksumOffload:  link.TXChecksumOffload,
+			RXChecksumOffload:  link.RXChecksumOffload,
+		})
+		if err != nil {
+			return err
+		}
+
+		switch link.QDisc {
+		case QDiscNone:
+		case QDiscFIFO:
+			log.Infof("Enabling FIFO QDisc on %q", link.Name)
+			linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000)
+		}
+
+		log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
+		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
+			return err
+		}
+
+		// Collect the routes from this link.
+		for _, r := range link.Routes {
+			route, err := r.toTcpipRoute(nicID)
+			if err != nil {
+				return err
+			}
+			routes = append(routes, route)
+		}
+	}
+
+	if !args.Defaultv4Gateway.Route.Empty() {
+		nicID, ok := nicids[args.Defaultv4Gateway.Name]
+		if !ok {
+			return fmt.Errorf("invalid interface name %q for default route", args.Defaultv4Gateway.Name)
+		}
+		route, err := args.Defaultv4Gateway.Route.toTcpipRoute(nicID)
+		if err != nil {
+			return err
+		}
+		routes = append(routes, route)
+	}
+
+	if !args.Defaultv6Gateway.Route.Empty() {
+		nicID, ok := nicids[args.Defaultv6Gateway.Name]
+		if !ok {
+			return fmt.Errorf("invalid interface name %q for default route", args.Defaultv6Gateway.Name)
+		}
+		route, err := args.Defaultv6Gateway.Route.toTcpipRoute(nicID)
+		if err != nil {
+			return err
+		}
+		routes = append(routes, route)
+	}
+
+	log.Infof("Setting routes %+v", routes)
+	n.Stack.SetRouteTable(routes)
+	return nil
+}
+
+// createNICWithAddrs creates a NIC in the network stack and adds the given
+// addresses.
+func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP) error {
+	opts := stack.NICOptions{Name: name}
+	if err := n.Stack.CreateNICWithOptions(id, sniffer.New(ep), opts); err != nil {
+		return fmt.Errorf("CreateNICWithOptions(%d, _, %+v) failed: %v", id, opts, err)
+	}
+
+	// Always start with an arp address for the NIC.
+	if err := n.Stack.AddAddress(id, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+		return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, arp.ProtocolNumber, arp.ProtocolAddress, err)
+	}
+
+	for _, addr := range addrs {
+		proto, tcpipAddr := ipToAddressAndProto(addr)
+		if err := n.Stack.AddAddress(id, proto, tcpipAddr); err != nil {
+			return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, proto, tcpipAddr, err)
+		}
+	}
+	return nil
+}
+
+// ipToAddressAndProto converts IP to tcpip.Address and a protocol number.
+//
+// Note: don't use 'len(ip)' to determine IP version because length is always 16.
+func ipToAddressAndProto(ip net.IP) (tcpip.NetworkProtocolNumber, tcpip.Address) {
+	if i4 := ip.To4(); i4 != nil {
+		return ipv4.ProtocolNumber, tcpip.Address(i4)
+	}
+	return ipv6.ProtocolNumber, tcpip.Address(ip)
+}
+
+// ipToAddress converts IP to tcpip.Address, ignoring the protocol.
+func ipToAddress(ip net.IP) tcpip.Address {
+	_, addr := ipToAddressAndProto(ip)
+	return addr
+}
+
+// ipMaskToAddressMask converts IPMask to tcpip.AddressMask, ignoring the
+// protocol.
+func ipMaskToAddressMask(ipMask net.IPMask) tcpip.AddressMask {
+	return tcpip.AddressMask(ipToAddress(net.IP(ipMask)))
+}
diff --git a/runsc/boot/platforms/BUILD b/runsc/boot/platforms/BUILD
new file mode 100644
index 000000000..77774f43c
--- /dev/null
+++ b/runsc/boot/platforms/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "platforms",
+    srcs = ["platforms.go"],
+    visibility = [
+        "//runsc:__subpackages__",
+    ],
+    deps = [
+        "//pkg/sentry/platform/kvm",
+        "//pkg/sentry/platform/ptrace",
+    ],
+)
diff --git a/runsc/boot/platforms/platforms.go b/runsc/boot/platforms/platforms.go
new file mode 100644
index 000000000..056b46ad5
--- /dev/null
+++ b/runsc/boot/platforms/platforms.go
@@ -0,0 +1,30 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package platforms imports all available platform packages.
+package platforms
+
+import (
+	// Import platforms that runsc might use.
+	_ "gvisor.dev/gvisor/pkg/sentry/platform/kvm"
+	_ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace"
+)
+
+const (
+	// Ptrace runs the sandbox with the ptrace platform.
+	Ptrace = "ptrace"
+
+	// KVM runs the sandbox with the KVM platform.
+	KVM = "kvm"
+)
diff --git a/runsc/boot/pprof/BUILD b/runsc/boot/pprof/BUILD
new file mode 100644
index 000000000..29cb42b2f
--- /dev/null
+++ b/runsc/boot/pprof/BUILD
@@ -0,0 +1,11 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "pprof",
+    srcs = ["pprof.go"],
+    visibility = [
+        "//runsc:__subpackages__",
+    ],
+)
diff --git a/runsc/boot/pprof/pprof.go b/runsc/boot/pprof/pprof.go
new file mode 100644
index 000000000..1ded20dee
--- /dev/null
+++ b/runsc/boot/pprof/pprof.go
@@ -0,0 +1,20 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pprof provides a stub to initialize custom profilers.
+package pprof
+
+// Initialize will be called at boot for initializing custom profilers.
+func Initialize() {
+}
diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go
new file mode 100644
index 000000000..fbfd3b07c
--- /dev/null
+++ b/runsc/boot/strace.go
@@ -0,0 +1,40 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/strace"
+)
+
+func enableStrace(conf *Config) error {
+	// We must initialize even if strace is not enabled.
+	strace.Initialize()
+
+	if !conf.Strace {
+		return nil
+	}
+
+	max := conf.StraceLogSize
+	if max == 0 {
+		max = 1024
+	}
+	strace.LogMaximumSize = max
+
+	if len(conf.StraceSyscalls) == 0 {
+		strace.EnableAll(strace.SinkTypeLog)
+		return nil
+	}
+	return strace.Enable(conf.StraceSyscalls, strace.SinkTypeLog)
+}
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
new file mode 100644
index 000000000..6ee6fae04
--- /dev/null
+++ b/runsc/boot/vfs.go
@@ -0,0 +1,482 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"path"
+	"sort"
+	"strings"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/devices/memdev"
+	"gvisor.dev/gvisor/pkg/sentry/devices/ttydev"
+	"gvisor.dev/gvisor/pkg/sentry/devices/tundev"
+	"gvisor.dev/gvisor/pkg/sentry/fs/user"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/fuse"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/overlay"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+func registerFilesystems(k *kernel.Kernel) error {
+	ctx := k.SupervisorContext()
+	creds := auth.NewRootCredentials(k.RootUserNamespace())
+	vfsObj := k.VFS()
+
+	vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserList: true,
+		// TODO(b/29356795): Users may mount this once the terminals are in a
+		//  usable state.
+		AllowUserMount: false,
+	})
+	vfsObj.MustRegisterFilesystemType(devtmpfs.Name, &devtmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserList: true,
+	})
+	vfsObj.MustRegisterFilesystemType(overlay.Name, &overlay.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(proc.Name, &proc.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(sys.Name, &sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+
+	// Setup files in devtmpfs.
+	if err := memdev.Register(vfsObj); err != nil {
+		return fmt.Errorf("registering memdev: %w", err)
+	}
+	if err := ttydev.Register(vfsObj); err != nil {
+		return fmt.Errorf("registering ttydev: %w", err)
+	}
+
+	if err := fuse.Register(vfsObj); err != nil {
+		return fmt.Errorf("registering fusedev: %w", err)
+	}
+	if err := tundev.Register(vfsObj); err != nil {
+		return fmt.Errorf("registering tundev: %v", err)
+	}
+	a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name)
+	if err != nil {
+		return fmt.Errorf("creating devtmpfs accessor: %w", err)
+	}
+	defer a.Release()
+
+	if err := a.UserspaceInit(ctx); err != nil {
+		return fmt.Errorf("initializing userspace: %w", err)
+	}
+	if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil {
+		return fmt.Errorf("creating memdev devtmpfs files: %w", err)
+	}
+	if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil {
+		return fmt.Errorf("creating ttydev devtmpfs files: %w", err)
+	}
+	if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil {
+		return fmt.Errorf("creating tundev devtmpfs files: %v", err)
+	}
+	if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil {
+		return fmt.Errorf("creating fusedev devtmpfs files: %w", err)
+	}
+	return nil
+}
+
+func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+	mns, err := mntr.setupVFS2(ctx, conf, procArgs)
+	if err != nil {
+		return fmt.Errorf("failed to setupFS: %w", err)
+	}
+	procArgs.MountNamespaceVFS2 = mns
+
+	// Resolve the executable path from working dir and environment.
+	resolved, err := user.ResolveExecutablePath(ctx, procArgs)
+	if err != nil {
+		return err
+	}
+	procArgs.Filename = resolved
+	return nil
+}
+
+func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
+	log.Infof("Configuring container's file system with VFS2")
+
+	// Create context with root credentials to mount the filesystem (the current
+	// user may not be privileged enough).
+	rootCreds := auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
+	rootProcArgs := *procArgs
+	rootProcArgs.WorkingDirectory = "/"
+	rootProcArgs.Credentials = rootCreds
+	rootProcArgs.Umask = 0022
+	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
+	rootCtx := procArgs.NewContext(c.k)
+
+	mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds)
+	if err != nil {
+		return nil, fmt.Errorf("creating mount namespace: %w", err)
+	}
+	rootProcArgs.MountNamespaceVFS2 = mns
+
+	// Mount submounts.
+	if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil {
+		return nil, fmt.Errorf("mounting submounts vfs2: %w", err)
+	}
+	return mns, nil
+}
+
+func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
+	fd := c.fds.remove()
+	opts := strings.Join(p9MountData(fd, conf.FileAccess, true /* vfs2 */), ",")
+
+	log.Infof("Mounting root over 9P, ioFD: %d", fd)
+	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{Data: opts})
+	if err != nil {
+		return nil, fmt.Errorf("setting up mount namespace: %w", err)
+	}
+	return mns, nil
+}
+
+func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
+	mounts, err := c.prepareMountsVFS2()
+	if err != nil {
+		return err
+	}
+
+	for i := range mounts {
+		submount := &mounts[i]
+		log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options)
+		if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() {
+			if err := c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint); err != nil {
+				return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err)
+			}
+		} else {
+			if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil {
+				return fmt.Errorf("mount submount %q: %w", submount.Destination, err)
+			}
+		}
+	}
+
+	if err := c.mountTmpVFS2(ctx, conf, creds, mns); err != nil {
+		return fmt.Errorf(`mount submount "\tmp": %w`, err)
+	}
+	return nil
+}
+
+type mountAndFD struct {
+	specs.Mount
+	fd int
+}
+
+func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
+	// Associate bind mounts with their FDs before sorting since there is an
+	// undocumented assumption that FDs are dispensed in the order in which
+	// they are required by mounts.
+	var mounts []mountAndFD
+	for _, m := range c.mounts {
+		fd := -1
+		// Only bind mounts use host FDs; see
+		// containerMounter.getMountNameAndOptionsVFS2.
+		if m.Type == bind {
+			fd = c.fds.remove()
+		}
+		mounts = append(mounts, mountAndFD{
+			Mount: m,
+			fd:    fd,
+		})
+	}
+	if err := c.checkDispenser(); err != nil {
+		return nil, err
+	}
+
+	// Sort the mounts so that we don't place children before parents.
+	sort.Slice(mounts, func(i, j int) bool {
+		return len(mounts[i].Destination) < len(mounts[j].Destination)
+	})
+
+	return mounts, nil
+}
+
+func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error {
+	root := mns.Root()
+	defer root.DecRef()
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(submount.Destination),
+	}
+	fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount)
+	if err != nil {
+		return fmt.Errorf("mountOptions failed: %w", err)
+	}
+	if len(fsName) == 0 {
+		// Filesystem is not supported (e.g. cgroup), just skip it.
+		return nil
+	}
+
+	if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil {
+		return err
+	}
+	if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil {
+		return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
+	}
+	log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts.GetFilesystemOptions.Data)
+	return nil
+}
+
+// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
+// used for mounts.
+func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) {
+	fsName := m.Type
+	var data []string
+
+	// Find filesystem name and FS specific data field.
+	switch m.Type {
+	case devpts.Name, devtmpfs.Name, proc.Name, sys.Name:
+		// Nothing to do.
+
+	case nonefs:
+		fsName = sys.Name
+
+	case tmpfs.Name:
+		var err error
+		data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
+		if err != nil {
+			return "", nil, err
+		}
+
+	case bind:
+		fsName = gofer.Name
+		if m.fd == 0 {
+			// Check that an FD was provided to fails fast. Technically FD=0 is valid,
+			// but unlikely to be correct in this context.
+			return "", nil, fmt.Errorf("9P mount requires a connection FD")
+		}
+		data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)
+
+	default:
+		log.Warningf("ignoring unknown filesystem type %q", m.Type)
+		return "", nil, nil
+	}
+
+	opts := &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			Data: strings.Join(data, ","),
+		},
+		InternalMount: true,
+	}
+
+	for _, o := range m.Options {
+		switch o {
+		case "rw":
+			opts.ReadOnly = false
+		case "ro":
+			opts.ReadOnly = true
+		case "noatime":
+			opts.Flags.NoATime = true
+		case "noexec":
+			opts.Flags.NoExec = true
+		default:
+			log.Warningf("ignoring unknown mount option %q", o)
+		}
+	}
+
+	if conf.Overlay {
+		// All writes go to upper, be paranoid and make lower readonly.
+		opts.ReadOnly = true
+	}
+	return fsName, opts, nil
+}
+
+func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error {
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(currentPath),
+	}
+	_, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{})
+	if err == nil {
+		log.Debugf("Mount point %q already exists", currentPath)
+		return nil
+	}
+	if err != syserror.ENOENT {
+		return fmt.Errorf("stat failed for %q during mount point creation: %w", currentPath, err)
+	}
+
+	// Recurse to ensure parent is created and then create the mount point.
+	if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil {
+		return err
+	}
+	log.Debugf("Creating dir %q for mount point", currentPath)
+	mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
+	if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil {
+		return fmt.Errorf("failed to create directory %q for mount: %w", currentPath, err)
+	}
+	return nil
+}
+
+// mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so.
+// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
+// the host /tmp, but this is a nice optimization, and fixes some apps that call
+// mknod in /tmp. It's unsafe to mount tmpfs if:
+//   1. /tmp is mounted explicitly: we should not override user's wish
+//   2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
+//
+// Note that when there are submounts inside of '/tmp', directories for the
+// mount points must be present, making '/tmp' not empty anymore.
+func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
+	for _, m := range c.mounts {
+		// m.Destination has been cleaned, so it's to use equality here.
+		if m.Destination == "/tmp" {
+			log.Debugf(`Explict "/tmp" mount found, skipping internal tmpfs, mount: %+v`, m)
+			return nil
+		}
+	}
+
+	root := mns.Root()
+	defer root.DecRef()
+	pop := vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse("/tmp"),
+	}
+	// TODO(gvisor.dev/issue/2782): Use O_PATH when available.
+	statx, err := c.k.VFS().StatAt(ctx, creds, &pop, &vfs.StatOptions{})
+	switch err {
+	case nil:
+		// Found '/tmp' in filesystem, check if it's empty.
+		if linux.FileMode(statx.Mode).FileType() != linux.ModeDirectory {
+			// Not a dir?! Leave it be.
+			return nil
+		}
+		if statx.Nlink > 2 {
+			// If more than "." and ".." is found, skip internal tmpfs to prevent
+			// hiding existing files.
+			log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`)
+			return nil
+		}
+		log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
+		fallthrough
+
+	case syserror.ENOENT:
+		// No '/tmp' found (or fallthrough from above). It's safe to mount internal
+		// tmpfs.
+		tmpMount := specs.Mount{
+			Type:        tmpfs.Name,
+			Destination: "/tmp",
+			// Sticky bit is added to prevent accidental deletion of files from
+			// another user. This is normally done for /tmp.
+			Options: []string{"mode=01777"},
+		}
+		return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
+
+	default:
+		return fmt.Errorf(`stating "/tmp" inside container: %w`, err)
+	}
+}
+
+// processHintsVFS2 processes annotations that container hints about how volumes
+// should be mounted (e.g. a volume shared between containers). It must be
+// called for the root container only.
+func (c *containerMounter) processHintsVFS2(conf *Config, creds *auth.Credentials) error {
+	ctx := c.k.SupervisorContext()
+	for _, hint := range c.hints.mounts {
+		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
+		// common gofer to mount all shared volumes.
+		if hint.mount.Type != tmpfs.Name {
+			continue
+		}
+
+		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
+		mnt, err := c.mountSharedMasterVFS2(ctx, conf, hint, creds)
+		if err != nil {
+			return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
+		}
+		hint.vfsMount = mnt
+	}
+	return nil
+}
+
+// mountSharedMasterVFS2 mounts the master of a volume that is shared among
+// containers in a pod.
+func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) {
+	// Map mount type to filesystem name, and parse out the options that we are
+	// capable of dealing with.
+	mntFD := &mountAndFD{Mount: hint.mount}
+	fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, mntFD)
+	if err != nil {
+		return nil, err
+	}
+	if len(fsName) == 0 {
+		return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
+	}
+	return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
+}
+
+// mountSharedSubmount binds mount to a previously mounted volume that is shared
+// among containers in the same pod.
+func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) error {
+	if err := source.checkCompatible(mount); err != nil {
+		return err
+	}
+
+	_, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount})
+	if err != nil {
+		return err
+	}
+	newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts)
+	if err != nil {
+		return err
+	}
+	defer newMnt.DecRef()
+
+	root := mns.Root()
+	defer root.DecRef()
+	if err := c.makeSyntheticMount(ctx, mount.Destination, root, creds); err != nil {
+		return err
+	}
+
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(mount.Destination),
+	}
+	if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
+		return err
+	}
+	log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
+	return nil
+}