Merge branch 'master' into ip-forwarding

- Merges aleksej-paschenko's with HEAD - Adds vfs2 support for ip_forward
author: Ian Lewis <ianmlewis@gmail.com> 2020-08-17 21:44:31 -0400
committer: Ian Lewis <ianmlewis@gmail.com> 2020-08-17 21:44:31 -0400
commit: ac324f646ee3cb7955b0b45a7453aeb9671cbdf1 (patch)
tree: 0cbc5018e8807421d701d190dc20525726c7ca76 /runsc/boot
parent: 352ae1022ce19de28fc72e034cc469872ad79d06 (diff)
parent: 6d0c5803d557d453f15ac6f683697eeb46dab680 (diff)
26 files changed, 1965 insertions, 1080 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 6fe2b57de..9f52438c2 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -7,38 +7,42 @@ go_library(
     srcs = [
         "compat.go",
         "compat_amd64.go",
+        "compat_arm64.go",
         "config.go",
         "controller.go",
         "debug.go",
         "events.go",
-        "fds.go",
         "fs.go",
         "limits.go",
         "loader.go",
         "network.go",
-        "pprof.go",
         "strace.go",
-        "user.go",
+        "vfs.go",
     ],
-    importpath = "gvisor.dev/gvisor/runsc/boot",
     visibility = [
+        "//pkg/test:__subpackages__",
         "//runsc:__subpackages__",
         "//test:__subpackages__",
     ],
     deps = [
         "//pkg/abi",
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/control/server",
         "//pkg/cpuid",
         "//pkg/eventchannel",
+        "//pkg/fspath",
         "//pkg/log",
         "//pkg/memutil",
         "//pkg/rand",
         "//pkg/refs",
         "//pkg/sentry/arch",
         "//pkg/sentry/arch:registers_go_proto",
-        "//pkg/sentry/context",
         "//pkg/sentry/control",
+        "//pkg/sentry/devices/memdev",
+        "//pkg/sentry/devices/ttydev",
+        "//pkg/sentry/devices/tundev",
+        "//pkg/sentry/fdimport",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/dev",
         "//pkg/sentry/fs/gofer",
@@ -48,6 +52,16 @@ go_library(
         "//pkg/sentry/fs/sys",
         "//pkg/sentry/fs/tmpfs",
         "//pkg/sentry/fs/tty",
+        "//pkg/sentry/fs/user",
+        "//pkg/sentry/fsimpl/devpts",
+        "//pkg/sentry/fsimpl/devtmpfs",
+        "//pkg/sentry/fsimpl/fuse",
+        "//pkg/sentry/fsimpl/gofer",
+        "//pkg/sentry/fsimpl/host",
+        "//pkg/sentry/fsimpl/overlay",
+        "//pkg/sentry/fsimpl/proc",
+        "//pkg/sentry/fsimpl/sys",
+        "//pkg/sentry/fsimpl/tmpfs",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel:uncaught_signal_go_proto",
@@ -60,20 +74,24 @@ go_library(
         "//pkg/sentry/socket/hostinet",
         "//pkg/sentry/socket/netlink",
         "//pkg/sentry/socket/netlink/route",
+        "//pkg/sentry/socket/netlink/uevent",
         "//pkg/sentry/socket/netstack",
         "//pkg/sentry/socket/unix",
         "//pkg/sentry/state",
         "//pkg/sentry/strace",
-        "//pkg/sentry/syscalls/linux",
+        "//pkg/sentry/syscalls/linux/vfs2",
         "//pkg/sentry/time",
         "//pkg/sentry/unimpl:unimplemented_syscall_go_proto",
         "//pkg/sentry/usage",
-        "//pkg/sentry/usermem",
+        "//pkg/sentry/vfs",
         "//pkg/sentry/watchdog",
+        "//pkg/sync",
         "//pkg/syserror",
         "//pkg/tcpip",
         "//pkg/tcpip/link/fdbased",
         "//pkg/tcpip/link/loopback",
+        "//pkg/tcpip/link/packetsocket",
+        "//pkg/tcpip/link/qdisc/fifo",
         "//pkg/tcpip/link/sniffer",
         "//pkg/tcpip/network/arp",
         "//pkg/tcpip/network/ipv4",
@@ -86,9 +104,10 @@ go_library(
         "//pkg/urpc",
         "//runsc/boot/filter",
         "//runsc/boot/platforms",
+        "//runsc/boot/pprof",
         "//runsc/specutils",
         "@com_github_golang_protobuf//proto:go_default_library",
-        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
@@ -100,19 +119,20 @@ go_test(
         "compat_test.go",
         "fs_test.go",
         "loader_test.go",
-        "user_test.go",
     ],
-    embed = [":boot"],
+    library = ":boot",
     deps = [
         "//pkg/control/server",
+        "//pkg/fspath",
         "//pkg/log",
         "//pkg/p9",
-        "//pkg/sentry/arch:registers_go_proto",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/fs",
-        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
         "//pkg/unet",
         "//runsc/fsgofer",
-        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
index 07e35ab10..84c67cbc2 100644
--- a/runsc/boot/compat.go
+++ b/runsc/boot/compat.go
@@ -17,18 +17,16 @@ package boot
 import (
 	"fmt"
 	"os"
-	"sync"
 	"syscall"
 
 	"github.com/golang/protobuf/proto"
-	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/eventchannel"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/arch"
 	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
 	ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
 	"gvisor.dev/gvisor/pkg/sentry/strace"
 	spb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 func initCompatLogs(fd int) error {
@@ -53,9 +51,9 @@ type compatEmitter struct {
 }
 
 func newCompatEmitter(logFD int) (*compatEmitter, error) {
-	nameMap, ok := strace.Lookup(abi.Linux, arch.AMD64)
+	nameMap, ok := getSyscallNameMap()
 	if !ok {
-		return nil, fmt.Errorf("amd64 Linux syscall table not found")
+		return nil, fmt.Errorf("Linux syscall table not found")
 	}
 
 	c := &compatEmitter{
@@ -67,7 +65,7 @@ func newCompatEmitter(logFD int) (*compatEmitter, error) {
 
 	if logFD > 0 {
 		f := os.NewFile(uintptr(logFD), "user log file")
-		target := log.MultiEmitter{c.sink, log.K8sJSONEmitter{log.Writer{Next: f}}}
+		target := &log.MultiEmitter{c.sink, log.K8sJSONEmitter{&log.Writer{Next: f}}}
 		c.sink = &log.BasicLogger{Level: log.Info, Emitter: target}
 	}
 	return c, nil
@@ -86,16 +84,16 @@ func (c *compatEmitter) Emit(msg proto.Message) (bool, error) {
 }
 
 func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) {
-	regs := us.Registers.GetArch().(*rpb.Registers_Amd64).Amd64
+	regs := us.Registers
 
 	c.mu.Lock()
 	defer c.mu.Unlock()
 
-	sysnr := regs.OrigRax
+	sysnr := syscallNum(regs)
 	tr := c.trackers[sysnr]
 	if tr == nil {
 		switch sysnr {
-		case syscall.SYS_PRCTL, syscall.SYS_ARCH_PRCTL:
+		case syscall.SYS_PRCTL:
 			// args: cmd, ...
 			tr = newArgsTracker(0)
 
@@ -112,12 +110,22 @@ func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) {
 			tr = newArgsTracker(2)
 
 		default:
-			tr = &onceTracker{}
+			tr = newArchArgsTracker(sysnr)
+			if tr == nil {
+				tr = &onceTracker{}
+			}
 		}
 		c.trackers[sysnr] = tr
 	}
+
 	if tr.shouldReport(regs) {
-		c.sink.Infof("Unsupported syscall: %s, regs: %+v", c.nameMap.Name(uintptr(sysnr)), regs)
+		name := c.nameMap.Name(uintptr(sysnr))
+		c.sink.Infof("Unsupported syscall %s(%#x,%#x,%#x,%#x,%#x,%#x). It is "+
+			"likely that you can safely ignore this message and that this is not "+
+			"the cause of any error. Please, refer to %s/%s for more information.",
+			name, argVal(0, regs), argVal(1, regs), argVal(2, regs), argVal(3, regs),
+			argVal(4, regs), argVal(5, regs), syscallLink, name)
+
 		tr.onReported(regs)
 	}
 }
@@ -139,10 +147,10 @@ func (c *compatEmitter) Close() error {
 // the syscall and arguments.
 type syscallTracker interface {
 	// shouldReport returns true is the syscall should be reported.
-	shouldReport(regs *rpb.AMD64Registers) bool
+	shouldReport(regs *rpb.Registers) bool
 
 	// onReported marks the syscall as reported.
-	onReported(regs *rpb.AMD64Registers)
+	onReported(regs *rpb.Registers)
 }
 
 // onceTracker reports only a single time, used for most syscalls.
@@ -150,10 +158,45 @@ type onceTracker struct {
 	reported bool
 }
 
-func (o *onceTracker) shouldReport(_ *rpb.AMD64Registers) bool {
+func (o *onceTracker) shouldReport(_ *rpb.Registers) bool {
 	return !o.reported
 }
 
-func (o *onceTracker) onReported(_ *rpb.AMD64Registers) {
+func (o *onceTracker) onReported(_ *rpb.Registers) {
 	o.reported = true
 }
+
+// argsTracker reports only once for each different combination of arguments.
+// It's used for generic syscalls like ioctl to report once per 'cmd'.
+type argsTracker struct {
+	// argsIdx is the syscall arguments to use as unique ID.
+	argsIdx  []int
+	reported map[string]struct{}
+	count    int
+}
+
+func newArgsTracker(argIdx ...int) *argsTracker {
+	return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})}
+}
+
+// key returns the command based on the syscall argument index.
+func (a *argsTracker) key(regs *rpb.Registers) string {
+	var rv string
+	for _, idx := range a.argsIdx {
+		rv += fmt.Sprintf("%d|", argVal(idx, regs))
+	}
+	return rv
+}
+
+func (a *argsTracker) shouldReport(regs *rpb.Registers) bool {
+	if a.count >= reportLimit {
+		return false
+	}
+	_, ok := a.reported[a.key(regs)]
+	return !ok
+}
+
+func (a *argsTracker) onReported(regs *rpb.Registers) {
+	a.count++
+	a.reported[a.key(regs)] = struct{}{}
+}
diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go
index 43cd0db94..8eb76b2ba 100644
--- a/runsc/boot/compat_amd64.go
+++ b/runsc/boot/compat_amd64.go
@@ -16,62 +16,85 @@ package boot
 
 import (
 	"fmt"
+	"syscall"
 
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+	"gvisor.dev/gvisor/pkg/sentry/strace"
 )
 
-// reportLimit is the max number of events that should be reported per tracker.
-const reportLimit = 100
+const (
+	// reportLimit is the max number of events that should be reported per
+	// tracker.
+	reportLimit = 100
+	syscallLink = "https://gvisor.dev/c/linux/amd64"
+)
 
-// argsTracker reports only once for each different combination of arguments.
-// It's used for generic syscalls like ioctl to report once per 'cmd'.
-type argsTracker struct {
-	// argsIdx is the syscall arguments to use as unique ID.
-	argsIdx  []int
-	reported map[string]struct{}
-	count    int
+// newRegs creates an empty Registers instance.
+func newRegs() *rpb.Registers {
+	return &rpb.Registers{
+		Arch: &rpb.Registers_Amd64{
+			Amd64: &rpb.AMD64Registers{},
+		},
+	}
 }
 
-func newArgsTracker(argIdx ...int) *argsTracker {
-	return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})}
-}
+func argVal(argIdx int, regs *rpb.Registers) uint64 {
+	amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
 
-// cmd returns the command based on the syscall argument index.
-func (a *argsTracker) key(regs *rpb.AMD64Registers) string {
-	var rv string
-	for _, idx := range a.argsIdx {
-		rv += fmt.Sprintf("%d|", argVal(idx, regs))
+	switch argIdx {
+	case 0:
+		return amd64Regs.Rdi
+	case 1:
+		return amd64Regs.Rsi
+	case 2:
+		return amd64Regs.Rdx
+	case 3:
+		return amd64Regs.R10
+	case 4:
+		return amd64Regs.R8
+	case 5:
+		return amd64Regs.R9
 	}
-	return rv
+	panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
 }
 
-func argVal(argIdx int, regs *rpb.AMD64Registers) uint32 {
+func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) {
+	amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
+
 	switch argIdx {
 	case 0:
-		return uint32(regs.Rdi)
+		amd64Regs.Rdi = argVal
 	case 1:
-		return uint32(regs.Rsi)
+		amd64Regs.Rsi = argVal
 	case 2:
-		return uint32(regs.Rdx)
+		amd64Regs.Rdx = argVal
 	case 3:
-		return uint32(regs.R10)
+		amd64Regs.R10 = argVal
 	case 4:
-		return uint32(regs.R8)
+		amd64Regs.R8 = argVal
 	case 5:
-		return uint32(regs.R9)
+		amd64Regs.R9 = argVal
+	default:
+		panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
 	}
-	panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
 }
 
-func (a *argsTracker) shouldReport(regs *rpb.AMD64Registers) bool {
-	if a.count >= reportLimit {
-		return false
-	}
-	_, ok := a.reported[a.key(regs)]
-	return !ok
+func getSyscallNameMap() (strace.SyscallMap, bool) {
+	return strace.Lookup(abi.Linux, arch.AMD64)
+}
+
+func syscallNum(regs *rpb.Registers) uint64 {
+	amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
+	return amd64Regs.OrigRax
 }
 
-func (a *argsTracker) onReported(regs *rpb.AMD64Registers) {
-	a.count++
-	a.reported[a.key(regs)] = struct{}{}
+func newArchArgsTracker(sysnr uint64) syscallTracker {
+	switch sysnr {
+	case syscall.SYS_ARCH_PRCTL:
+		// args: cmd, ...
+		return newArgsTracker(0)
+	}
+	return nil
 }
diff --git a/runsc/boot/compat_arm64.go b/runsc/boot/compat_arm64.go
new file mode 100644
index 000000000..bce9d95b3
--- /dev/null
+++ b/runsc/boot/compat_arm64.go
@@ -0,0 +1,95 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+	"gvisor.dev/gvisor/pkg/sentry/strace"
+)
+
+const (
+	// reportLimit is the max number of events that should be reported per
+	// tracker.
+	reportLimit = 100
+	syscallLink = "https://gvisor.dev/c/linux/arm64"
+)
+
+// newRegs creates an empty Registers instance.
+func newRegs() *rpb.Registers {
+	return &rpb.Registers{
+		Arch: &rpb.Registers_Arm64{
+			Arm64: &rpb.ARM64Registers{},
+		},
+	}
+}
+
+func argVal(argIdx int, regs *rpb.Registers) uint64 {
+	arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+
+	switch argIdx {
+	case 0:
+		return arm64Regs.R0
+	case 1:
+		return arm64Regs.R1
+	case 2:
+		return arm64Regs.R2
+	case 3:
+		return arm64Regs.R3
+	case 4:
+		return arm64Regs.R4
+	case 5:
+		return arm64Regs.R5
+	}
+	panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+}
+
+func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) {
+	arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+
+	switch argIdx {
+	case 0:
+		arm64Regs.R0 = argVal
+	case 1:
+		arm64Regs.R1 = argVal
+	case 2:
+		arm64Regs.R2 = argVal
+	case 3:
+		arm64Regs.R3 = argVal
+	case 4:
+		arm64Regs.R4 = argVal
+	case 5:
+		arm64Regs.R5 = argVal
+	default:
+		panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+	}
+}
+
+func getSyscallNameMap() (strace.SyscallMap, bool) {
+	return strace.Lookup(abi.Linux, arch.ARM64)
+}
+
+func syscallNum(regs *rpb.Registers) uint64 {
+	arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+	return arm64Regs.R8
+}
+
+func newArchArgsTracker(sysnr uint64) syscallTracker {
+	// Currently, no arch-specific syscalls need to be handled here.
+	return nil
+}
diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go
index 388298d8d..839c5303b 100644
--- a/runsc/boot/compat_test.go
+++ b/runsc/boot/compat_test.go
@@ -16,8 +16,6 @@ package boot
 
 import (
 	"testing"
-
-	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
 )
 
 func TestOnceTracker(t *testing.T) {
@@ -35,31 +33,34 @@ func TestOnceTracker(t *testing.T) {
 
 func TestArgsTracker(t *testing.T) {
 	for _, tc := range []struct {
-		name string
-		idx  []int
-		rdi1 uint64
-		rdi2 uint64
-		rsi1 uint64
-		rsi2 uint64
-		want bool
+		name   string
+		idx    []int
+		arg1_1 uint64
+		arg1_2 uint64
+		arg2_1 uint64
+		arg2_2 uint64
+		want   bool
 	}{
-		{name: "same rdi", idx: []int{0}, rdi1: 123, rdi2: 123, want: false},
-		{name: "same rsi", idx: []int{1}, rsi1: 123, rsi2: 123, want: false},
-		{name: "diff rdi", idx: []int{0}, rdi1: 123, rdi2: 321, want: true},
-		{name: "diff rsi", idx: []int{1}, rsi1: 123, rsi2: 321, want: true},
-		{name: "cmd is uint32", idx: []int{0}, rsi1: 0xdead00000123, rsi2: 0xbeef00000123, want: false},
-		{name: "same 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 123, rdi2: 321, want: false},
-		{name: "diff 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 789, rdi2: 987, want: true},
+		{name: "same arg1", idx: []int{0}, arg1_1: 123, arg1_2: 123, want: false},
+		{name: "same arg2", idx: []int{1}, arg2_1: 123, arg2_2: 123, want: false},
+		{name: "diff arg1", idx: []int{0}, arg1_1: 123, arg1_2: 321, want: true},
+		{name: "diff arg2", idx: []int{1}, arg2_1: 123, arg2_2: 321, want: true},
+		{name: "cmd is uint32", idx: []int{0}, arg2_1: 0xdead00000123, arg2_2: 0xbeef00000123, want: false},
+		{name: "same 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 123, arg1_2: 321, want: false},
+		{name: "diff 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 789, arg1_2: 987, want: true},
 	} {
 		t.Run(tc.name, func(t *testing.T) {
 			c := newArgsTracker(tc.idx...)
-			regs := &rpb.AMD64Registers{Rdi: tc.rdi1, Rsi: tc.rsi1}
+			regs := newRegs()
+			setArgVal(0, tc.arg1_1, regs)
+			setArgVal(1, tc.arg2_1, regs)
 			if !c.shouldReport(regs) {
 				t.Error("first call to shouldReport, got: false, want: true")
 			}
 			c.onReported(regs)
 
-			regs.Rdi, regs.Rsi = tc.rdi2, tc.rsi2
+			setArgVal(0, tc.arg1_2, regs)
+			setArgVal(1, tc.arg2_2, regs)
 			if got := c.shouldReport(regs); tc.want != got {
 				t.Errorf("second call to shouldReport, got: %t, want: %t", got, tc.want)
 			}
@@ -70,7 +71,9 @@ func TestArgsTracker(t *testing.T) {
 func TestArgsTrackerLimit(t *testing.T) {
 	c := newArgsTracker(0, 1)
 	for i := 0; i < reportLimit; i++ {
-		regs := &rpb.AMD64Registers{Rdi: 123, Rsi: uint64(i)}
+		regs := newRegs()
+		setArgVal(0, 123, regs)
+		setArgVal(1, uint64(i), regs)
 		if !c.shouldReport(regs) {
 			t.Error("shouldReport before limit was reached, got: false, want: true")
 		}
@@ -78,7 +81,9 @@ func TestArgsTrackerLimit(t *testing.T) {
 	}
 
 	// Should hit the count limit now.
-	regs := &rpb.AMD64Registers{Rdi: 123, Rsi: 123456}
+	regs := newRegs()
+	setArgVal(0, 123, regs)
+	setArgVal(1, 123456, regs)
 	if c.shouldReport(regs) {
 		t.Error("shouldReport after limit was reached, got: true, want: false")
 	}
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 72a33534f..80da8b3e6 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -158,6 +158,9 @@ type Config struct {
 	// DebugLog is the path to log debug information to, if not empty.
 	DebugLog string
 
+	// PanicLog is the path to log Go's runtime messages, if not empty.
+	PanicLog string
+
 	// DebugLogFormat is the log format for debug.
 	DebugLogFormat string
 
@@ -184,6 +187,16 @@ type Config struct {
 	// SoftwareGSO indicates that software segmentation offload is enabled.
 	SoftwareGSO bool
 
+	// TXChecksumOffload indicates that TX Checksum Offload is enabled.
+	TXChecksumOffload bool
+
+	// RXChecksumOffload indicates that RX Checksum Offload is enabled.
+	RXChecksumOffload bool
+
+	// QDisc indicates the type of queueing discipline to use by default
+	// for non-loopback interfaces.
+	QDisc QueueingDiscipline
+
 	// LogPackets indicates that all network packets should be logged.
 	LogPackets bool
 
@@ -234,8 +247,10 @@ type Config struct {
 	// ReferenceLeakMode sets reference leak check mode
 	ReferenceLeakMode refs.LeakMode
 
-	// OverlayfsStaleRead causes cached FDs to reopen after a file is opened for
-	// write to workaround overlayfs limitation on kernels before 4.19.
+	// OverlayfsStaleRead instructs the sandbox to assume that the root mount
+	// is on a Linux overlayfs mount, which does not necessarily preserve
+	// coherence between read-only and subsequent writable file descriptors
+	// representing the "same" file.
 	OverlayfsStaleRead bool
 
 	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
@@ -250,6 +265,18 @@ type Config struct {
 	// multiple tests are run in parallel, since there is no way to pass
 	// parameters to the runtime from docker.
 	TestOnlyTestNameEnv string
+
+	// CPUNumFromQuota sets CPU number count to available CPU quota, using
+	// least integer value greater than or equal to quota.
+	//
+	// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
+	CPUNumFromQuota bool
+
+	// Enables VFS2 (not plumbed through yet).
+	VFS2 bool
+
+	// Enables FUSE usage (not plumbed through yet).
+	FUSE bool
 }
 
 // ToFlags returns a slice of flags that correspond to the given Config.
@@ -260,6 +287,7 @@ func (c *Config) ToFlags() []string {
 		"--log=" + c.LogFilename,
 		"--log-format=" + c.LogFormat,
 		"--debug-log=" + c.DebugLog,
+		"--panic-log=" + c.PanicLog,
 		"--debug-log-format=" + c.DebugLogFormat,
 		"--file-access=" + c.FileAccess.String(),
 		"--overlay=" + strconv.FormatBool(c.Overlay),
@@ -280,7 +308,13 @@ func (c *Config) ToFlags() []string {
 		"--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode),
 		"--gso=" + strconv.FormatBool(c.HardwareGSO),
 		"--software-gso=" + strconv.FormatBool(c.SoftwareGSO),
+		"--rx-checksum-offload=" + strconv.FormatBool(c.RXChecksumOffload),
+		"--tx-checksum-offload=" + strconv.FormatBool(c.TXChecksumOffload),
 		"--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
+		"--qdisc=" + c.QDisc.String(),
+	}
+	if c.CPUNumFromQuota {
+		f = append(f, "--cpu-num-from-quota")
 	}
 	// Only include these if set since it is never to be used by users.
 	if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
@@ -289,5 +323,14 @@ func (c *Config) ToFlags() []string {
 	if len(c.TestOnlyTestNameEnv) != 0 {
 		f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv)
 	}
+
+	if c.VFS2 {
+		f = append(f, "--vfs2=true")
+	}
+
+	if c.FUSE {
+		f = append(f, "--fuse=true")
+	}
+
 	return f
 }
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 5f644b57e..626a3816e 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -32,6 +32,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
+	"gvisor.dev/gvisor/runsc/boot/pprof"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
@@ -51,7 +52,7 @@ const (
 	ContainerEvent = "containerManager.Event"
 
 	// ContainerExecuteAsync is the URPC endpoint for executing a command in a
-	// container..
+	// container.
 	ContainerExecuteAsync = "containerManager.ExecuteAsync"
 
 	// ContainerPause pauses the container.
@@ -103,6 +104,8 @@ const (
 	StartCPUProfile = "Profile.StartCPUProfile"
 	StopCPUProfile  = "Profile.StopCPUProfile"
 	HeapProfile     = "Profile.HeapProfile"
+	BlockProfile    = "Profile.BlockProfile"
+	MutexProfile    = "Profile.MutexProfile"
 	StartTrace      = "Profile.StartTrace"
 	StopTrace       = "Profile.StopTrace"
 )
@@ -125,43 +128,55 @@ type controller struct {
 
 	// manager holds the containerManager methods.
 	manager *containerManager
+
+	// pprof holds the profile instance if enabled. It may be nil.
+	pprof *control.Profile
 }
 
 // newController creates a new controller. The caller must call
 // controller.srv.StartServing() to start the controller.
 func newController(fd int, l *Loader) (*controller, error) {
-	srv, err := server.CreateFromFD(fd)
+	ctrl := &controller{}
+	var err error
+	ctrl.srv, err = server.CreateFromFD(fd)
 	if err != nil {
 		return nil, err
 	}
 
-	manager := &containerManager{
+	ctrl.manager = &containerManager{
 		startChan:       make(chan struct{}),
 		startResultChan: make(chan error),
 		l:               l,
 	}
-	srv.Register(manager)
+	ctrl.srv.Register(ctrl.manager)
 
-	if eps, ok := l.k.NetworkStack().(*netstack.Stack); ok {
+	if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok {
 		net := &Network{
 			Stack: eps.Stack,
 		}
-		srv.Register(net)
+		ctrl.srv.Register(net)
 	}
 
-	srv.Register(&debug{})
-	srv.Register(&control.Logging{})
-	if l.conf.ProfileEnable {
-		srv.Register(&control.Profile{})
+	ctrl.srv.Register(&debug{})
+	ctrl.srv.Register(&control.Logging{})
+
+	if l.root.conf.ProfileEnable {
+		ctrl.pprof = &control.Profile{Kernel: l.k}
+		ctrl.srv.Register(ctrl.pprof)
 	}
 
-	return &controller{
-		srv:     srv,
-		manager: manager,
-	}, nil
+	return ctrl, nil
+}
+
+func (c *controller) stop() {
+	if c.pprof != nil {
+		// These are noop if there is nothing being profiled.
+		_ = c.pprof.StopCPUProfile(nil, nil)
+		_ = c.pprof.StopTrace(nil, nil)
+	}
 }
 
-// containerManager manages sandboes containers.
+// containerManager manages sandbox containers.
 type containerManager struct {
 	// startChan is used to signal when the root container process should
 	// be started.
@@ -327,7 +342,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	// Pause the kernel while we build a new one.
 	cm.l.k.Pause()
 
-	p, err := createPlatform(cm.l.conf, deviceFile)
+	p, err := createPlatform(cm.l.root.conf, deviceFile)
 	if err != nil {
 		return fmt.Errorf("creating platform: %v", err)
 	}
@@ -339,12 +354,12 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 		return fmt.Errorf("creating memory file: %v", err)
 	}
 	k.SetMemoryFile(mf)
-	networkStack := cm.l.k.NetworkStack()
+	networkStack := cm.l.k.RootNetworkNamespace().Stack()
 	cm.l.k = k
 
 	// Set up the restore environment.
-	mntr := newContainerMounter(cm.l.spec, cm.l.goferFDs, cm.l.k, cm.l.mountHints)
-	renv, err := mntr.createRestoreEnvironment(cm.l.conf)
+	mntr := newContainerMounter(cm.l.root.spec, cm.l.root.goferFDs, cm.l.k, cm.l.mountHints)
+	renv, err := mntr.createRestoreEnvironment(cm.l.root.conf)
 	if err != nil {
 		return fmt.Errorf("creating RestoreEnvironment: %v", err)
 	}
@@ -362,10 +377,10 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 		return fmt.Errorf("file cannot be empty")
 	}
 
-	if cm.l.conf.ProfileEnable {
-		// initializePProf opens /proc/self/maps, so has to be
-		// called before installing seccomp filters.
-		initializePProf()
+	if cm.l.root.conf.ProfileEnable {
+		// pprof.Initialize opens /proc/self/maps, so has to be called before
+		// installing seccomp filters.
+		pprof.Initialize()
 	}
 
 	// Seccomp filters have to be applied before parsing the state file.
@@ -380,12 +395,14 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	}
 
 	// Since we have a new kernel we also must make a new watchdog.
-	dog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction)
+	dogOpts := watchdog.DefaultOpts
+	dogOpts.TaskTimeoutAction = cm.l.root.conf.WatchdogAction
+	dog := watchdog.New(k, dogOpts)
 
 	// Change the loader fields to reflect the changes made when restoring.
 	cm.l.k = k
 	cm.l.watchdog = dog
-	cm.l.rootProcArgs = kernel.CreateProcessArgs{}
+	cm.l.root.procArgs = kernel.CreateProcessArgs{}
 	cm.l.restore = true
 
 	// Reinitialize the sandbox ID and processes map. Note that it doesn't
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go
deleted file mode 100644
index e5de1f3d7..000000000
--- a/runsc/boot/fds.go
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package boot
-
-import (
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/fs/host"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-)
-
-// createFDTable creates an FD table that contains stdin, stdout, and stderr.
-// If console is true, then ioctl calls will be passed through to the host FD.
-// Upon success, createFDMap dups then closes stdioFDs.
-func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, error) {
-	if len(stdioFDs) != 3 {
-		return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
-	}
-
-	k := kernel.KernelFromContext(ctx)
-	fdTable := k.NewFDTable()
-	defer fdTable.DecRef()
-	mounter := fs.FileOwnerFromContext(ctx)
-
-	var ttyFile *fs.File
-	for appFD, hostFD := range stdioFDs {
-		var appFile *fs.File
-
-		if console && appFD < 3 {
-			// Import the file as a host TTY file.
-			if ttyFile == nil {
-				var err error
-				appFile, err = host.ImportFile(ctx, hostFD, mounter, true /* isTTY */)
-				if err != nil {
-					return nil, err
-				}
-				defer appFile.DecRef()
-
-				// Remember this in the TTY file, as we will
-				// use it for the other stdio FDs.
-				ttyFile = appFile
-			} else {
-				// Re-use the existing TTY file, as all three
-				// stdio FDs must point to the same fs.File in
-				// order to share TTY state, specifically the
-				// foreground process group id.
-				appFile = ttyFile
-			}
-		} else {
-			// Import the file as a regular host file.
-			var err error
-			appFile, err = host.ImportFile(ctx, hostFD, mounter, false /* isTTY */)
-			if err != nil {
-				return nil, err
-			}
-			defer appFile.DecRef()
-		}
-
-		// Add the file to the FD map.
-		if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil {
-			return nil, err
-		}
-	}
-
-	fdTable.IncRef()
-	return fdTable, nil
-}
diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD
index f5509b6b7..ed18f0047 100644
--- a/runsc/boot/filter/BUILD
+++ b/runsc/boot/filter/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
@@ -6,12 +6,14 @@ go_library(
     name = "filter",
     srcs = [
         "config.go",
+        "config_amd64.go",
+        "config_arm64.go",
+        "config_profile.go",
         "extra_filters.go",
         "extra_filters_msan.go",
         "extra_filters_race.go",
         "filter.go",
     ],
-    importpath = "gvisor.dev/gvisor/runsc/boot/filter",
     visibility = [
         "//runsc/boot:__subpackages__",
     ],
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 5ad108261..149eb0b1b 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -26,10 +26,6 @@ import (
 
 // allowedSyscalls is the set of syscalls executed by the Sentry to the host OS.
 var allowedSyscalls = seccomp.SyscallRules{
-	syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
-		{seccomp.AllowValue(linux.ARCH_GET_FS)},
-		{seccomp.AllowValue(linux.ARCH_SET_FS)},
-	},
 	syscall.SYS_CLOCK_GETTIME: {},
 	syscall.SYS_CLONE: []seccomp.Rule{
 		{
@@ -42,9 +38,15 @@ var allowedSyscalls = seccomp.SyscallRules{
 					syscall.CLONE_THREAD),
 		},
 	},
-	syscall.SYS_CLOSE:         {},
-	syscall.SYS_DUP:           {},
-	syscall.SYS_DUP2:          {},
+	syscall.SYS_CLOSE: {},
+	syscall.SYS_DUP:   {},
+	syscall.SYS_DUP3: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.O_CLOEXEC),
+		},
+	},
 	syscall.SYS_EPOLL_CREATE1: {},
 	syscall.SYS_EPOLL_CTL:     {},
 	syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
@@ -132,11 +134,6 @@ var allowedSyscalls = seccomp.SyscallRules{
 			seccomp.AllowValue(syscall.SOL_SOCKET),
 			seccomp.AllowValue(syscall.SO_SNDBUF),
 		},
-		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.SOL_SOCKET),
-			seccomp.AllowValue(syscall.SO_REUSEADDR),
-		},
 	},
 	syscall.SYS_GETTID:       {},
 	syscall.SYS_GETTIMEOFDAY: {},
@@ -177,6 +174,18 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_LSEEK:   {},
 	syscall.SYS_MADVISE: {},
 	syscall.SYS_MINCORE: {},
+	// Used by the Go runtime as a temporary workaround for a Linux
+	// 5.2-5.4 bug.
+	//
+	// See src/runtime/os_linux_x86.go.
+	//
+	// TODO(b/148688965): Remove once this is gone from Go.
+	syscall.SYS_MLOCK: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(4096),
+		},
+	},
 	syscall.SYS_MMAP: []seccomp.Rule{
 		{
 			seccomp.AllowAny{},
@@ -220,7 +229,11 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_NANOSLEEP: {},
 	syscall.SYS_PPOLL:     {},
 	syscall.SYS_PREAD64:   {},
+	syscall.SYS_PREADV:    {},
+	unix.SYS_PREADV2:      {},
 	syscall.SYS_PWRITE64:  {},
+	syscall.SYS_PWRITEV:   {},
+	unix.SYS_PWRITEV2:     {},
 	syscall.SYS_READ:      {},
 	syscall.SYS_RECVMSG: []seccomp.Rule{
 		{
@@ -273,26 +286,36 @@ var allowedSyscalls = seccomp.SyscallRules{
 		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
 	},
 	syscall.SYS_SIGALTSTACK:     {},
+	unix.SYS_STATX:              {},
 	syscall.SYS_SYNC_FILE_RANGE: {},
+	syscall.SYS_TEE: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(1),                      /* len */
+			seccomp.AllowValue(unix.SPLICE_F_NONBLOCK), /* flags */
+		},
+	},
 	syscall.SYS_TGKILL: []seccomp.Rule{
 		{
 			seccomp.AllowValue(uint64(os.Getpid())),
 		},
 	},
-	syscall.SYS_WRITE: {},
-	// The only user in rawfile.NonBlockingWrite3 always passes iovcnt with
-	// values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR
-	// option is enabled for a packet socket.
-	syscall.SYS_WRITEV: []seccomp.Rule{
+	syscall.SYS_UTIMENSAT: []seccomp.Rule{
 		{
 			seccomp.AllowAny{},
+			seccomp.AllowValue(0), /* null pathname */
 			seccomp.AllowAny{},
-			seccomp.AllowValue(2),
+			seccomp.AllowValue(0), /* flags */
 		},
+	},
+	syscall.SYS_WRITE: {},
+	// For rawfile.NonBlockingWriteIovec.
+	syscall.SYS_WRITEV: []seccomp.Rule{
 		{
 			seccomp.AllowAny{},
 			seccomp.AllowAny{},
-			seccomp.AllowValue(3),
+			seccomp.GreaterThan(0),
 		},
 	},
 }
@@ -315,6 +338,26 @@ func hostInetFilters() seccomp.SyscallRules {
 		syscall.SYS_GETSOCKOPT: []seccomp.Rule{
 			{
 				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IP),
+				seccomp.AllowValue(syscall.IP_TOS),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IP),
+				seccomp.AllowValue(syscall.IP_RECVTOS),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_TCLASS),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+			},
+			{
+				seccomp.AllowAny{},
 				seccomp.AllowValue(syscall.SOL_IPV6),
 				seccomp.AllowValue(syscall.IPV6_V6ONLY),
 			},
@@ -416,6 +459,34 @@ func hostInetFilters() seccomp.SyscallRules {
 				seccomp.AllowAny{},
 				seccomp.AllowValue(4),
 			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IP),
+				seccomp.AllowValue(syscall.IP_TOS),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IP),
+				seccomp.AllowValue(syscall.IP_RECVTOS),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_TCLASS),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
 		},
 		syscall.SYS_SHUTDOWN: []seccomp.Rule{
 			{
@@ -479,16 +550,3 @@ func controlServerFilters(fd int) seccomp.SyscallRules {
 		},
 	}
 }
-
-// profileFilters returns extra syscalls made by runtime/pprof package.
-func profileFilters() seccomp.SyscallRules {
-	return seccomp.SyscallRules{
-		syscall.SYS_OPENAT: []seccomp.Rule{
-			{
-				seccomp.AllowAny{},
-				seccomp.AllowAny{},
-				seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
-			},
-		},
-	}
-}
diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go
new file mode 100644
index 000000000..5335ff82c
--- /dev/null
+++ b/runsc/boot/filter/config_amd64.go
@@ -0,0 +1,31 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+func init() {
+	allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL],
+		seccomp.Rule{seccomp.AllowValue(linux.ARCH_GET_FS)},
+		seccomp.Rule{seccomp.AllowValue(linux.ARCH_SET_FS)},
+	)
+}
diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go
new file mode 100644
index 000000000..7fa9bbda3
--- /dev/null
+++ b/runsc/boot/filter/config_arm64.go
@@ -0,0 +1,21 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package filter
+
+// Reserved for future customization.
+func init() {
+}
diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go
new file mode 100644
index 000000000..194952a7b
--- /dev/null
+++ b/runsc/boot/filter/config_profile.go
@@ -0,0 +1,34 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// profileFilters returns extra syscalls made by runtime/pprof package.
+func profileFilters() seccomp.SyscallRules {
+	return seccomp.SyscallRules{
+		syscall.SYS_OPENAT: []seccomp.Rule{
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
+			},
+		},
+	}
+}
diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go
index 5e5a3c998..209e646a7 100644
--- a/runsc/boot/filter/extra_filters_msan.go
+++ b/runsc/boot/filter/extra_filters_msan.go
@@ -26,6 +26,8 @@ import (
 func instrumentationFilters() seccomp.SyscallRules {
 	Report("MSAN is enabled: syscall filters less restrictive!")
 	return seccomp.SyscallRules{
+		syscall.SYS_CLONE:             {},
+		syscall.SYS_MMAP:              {},
 		syscall.SYS_SCHED_GETAFFINITY: {},
 		syscall.SYS_SET_ROBUST_LIST:   {},
 	}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 76036c147..9dd5b0184 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -16,7 +16,6 @@ package boot
 
 import (
 	"fmt"
-	"path"
 	"path/filepath"
 	"sort"
 	"strconv"
@@ -30,14 +29,22 @@ import (
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/user"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+	gofervfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
+	procvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
+	sysvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
+	tmpfsvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -45,27 +52,19 @@ import (
 )
 
 const (
-	// Filesystem name for 9p gofer mounts.
-	rootFsName = "9p"
-
 	// Device name for root mount.
 	rootDevice = "9pfs-/"
 
 	// MountPrefix is the annotation prefix for mount hints.
-	MountPrefix = "gvisor.dev/spec/mount"
-
-	// Filesystems that runsc supports.
-	bind     = "bind"
-	devpts   = "devpts"
-	devtmpfs = "devtmpfs"
-	proc     = "proc"
-	sysfs    = "sysfs"
-	tmpfs    = "tmpfs"
-	nonefs   = "none"
+	MountPrefix = "dev.gvisor.spec.mount."
+
+	// Supported filesystems that map to a different internal filesystem.
+	bind   = "bind"
+	nonefs = "none"
 )
 
 // tmpfs has some extra supported options that we must pass through.
-var tmpfsAllowedOptions = []string{"mode", "uid", "gid"}
+var tmpfsAllowedData = []string{"mode", "uid", "gid"}
 
 func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
 	// Upper layer uses the same flags as lower, but it must be read-write.
@@ -109,12 +108,12 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
 
 	// Always mount /dev.
 	mounts = append(mounts, specs.Mount{
-		Type:        devtmpfs,
+		Type:        devtmpfs.Name,
 		Destination: "/dev",
 	})
 
 	mounts = append(mounts, specs.Mount{
-		Type:        devpts,
+		Type:        devpts.Name,
 		Destination: "/dev/pts",
 	})
 
@@ -138,13 +137,13 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
 	var mandatoryMounts []specs.Mount
 	if !procMounted {
 		mandatoryMounts = append(mandatoryMounts, specs.Mount{
-			Type:        proc,
+			Type:        procvfs2.Name,
 			Destination: "/proc",
 		})
 	}
 	if !sysMounted {
 		mandatoryMounts = append(mandatoryMounts, specs.Mount{
-			Type:        sysfs,
+			Type:        sysvfs2.Name,
 			Destination: "/sys",
 		})
 	}
@@ -156,13 +155,17 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
 	return mounts
 }
 
-// p9MountOptions creates a slice of options for a p9 mount.
-func p9MountOptions(fd int, fa FileAccessType) []string {
+// p9MountData creates a slice of p9 mount data.
+func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
 	opts := []string{
 		"trans=fd",
 		"rfdno=" + strconv.Itoa(fd),
 		"wfdno=" + strconv.Itoa(fd),
-		"privateunixsocket=true",
+	}
+	if !vfs2 {
+		// privateunixsocket is always enabled in VFS2. VFS1 requires explicit
+		// enablement.
+		opts = append(opts, "privateunixsocket=true")
 	}
 	if fa == FileAccessShared {
 		opts = append(opts, "cache=remote_revalidating")
@@ -232,8 +235,8 @@ func isSupportedMountFlag(fstype, opt string) bool {
 	case "rw", "ro", "noatime", "noexec":
 		return true
 	}
-	if fstype == tmpfs {
-		ok, err := parseMountOption(opt, tmpfsAllowedOptions...)
+	if fstype == tmpfsvfs2.Name {
+		ok, err := parseMountOption(opt, tmpfsAllowedData...)
 		return ok && err == nil
 	}
 	return false
@@ -279,6 +282,9 @@ func subtargets(root string, mnts []specs.Mount) []string {
 }
 
 func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+	if conf.VFS2 {
+		return setupContainerVFS2(ctx, conf, mntr, procArgs)
+	}
 	mns, err := mntr.setupFS(conf, procArgs)
 	if err != nil {
 		return err
@@ -287,19 +293,12 @@ func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter,
 	// Set namespace here so that it can be found in ctx.
 	procArgs.MountNamespace = mns
 
-	return setExecutablePath(ctx, procArgs)
-}
-
-// setExecutablePath sets the procArgs.Filename by searching the PATH for an
-// executable matching the procArgs.Argv[0].
-func setExecutablePath(ctx context.Context, procArgs *kernel.CreateProcessArgs) error {
-	paths := fs.GetPath(procArgs.Envv)
-	exe := procArgs.Argv[0]
-	f, err := procArgs.MountNamespace.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths)
+	// Resolve the executable path from working dir and environment.
+	resolved, err := user.ResolveExecutablePath(ctx, procArgs)
 	if err != nil {
-		return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err)
+		return err
 	}
-	procArgs.Filename = f
+	procArgs.Filename = resolved
 	return nil
 }
 
@@ -392,6 +391,10 @@ type mountHint struct {
 	// root is the inode where the volume is mounted. For mounts with 'pod' share
 	// the volume is mounted once and then bind mounted inside the containers.
 	root *fs.Inode
+
+	// vfsMount is the master mount for the volume. For mounts with 'pod' share
+	// the master volume is bind mounted inside the containers.
+	vfsMount *vfs.Mount
 }
 
 func (m *mountHint) setField(key, val string) error {
@@ -439,7 +442,7 @@ func (m *mountHint) setOptions(val string) error {
 }
 
 func (m *mountHint) isSupported() bool {
-	return m.mount.Type == tmpfs && m.share == pod
+	return m.mount.Type == tmpfsvfs2.Name && m.share == pod
 }
 
 // checkCompatible verifies that shared mount is compatible with master.
@@ -465,6 +468,13 @@ func (m *mountHint) checkCompatible(mount specs.Mount) error {
 	return nil
 }
 
+func (m *mountHint) fileAccessType() FileAccessType {
+	if m.share == container {
+		return FileAccessExclusive
+	}
+	return FileAccessShared
+}
+
 func filterUnsupportedOptions(mount specs.Mount) []string {
 	rv := make([]string, 0, len(mount.Options))
 	for _, o := range mount.Options {
@@ -483,14 +493,15 @@ type podMountHints struct {
 func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
 	mnts := make(map[string]*mountHint)
 	for k, v := range spec.Annotations {
-		// Look for 'gvisor.dev/spec/mount' annotations and parse them.
+		// Look for 'dev.gvisor.spec.mount' annotations and parse them.
 		if strings.HasPrefix(k, MountPrefix) {
-			parts := strings.Split(k, "/")
-			if len(parts) != 5 {
+			// Remove the prefix and split the rest.
+			parts := strings.Split(k[len(MountPrefix):], ".")
+			if len(parts) != 2 {
 				return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v)
 			}
-			name := parts[3]
-			if len(name) == 0 || path.Clean(name) != name {
+			name := parts[0]
+			if len(name) == 0 {
 				return nil, fmt.Errorf("invalid mount name: %s", name)
 			}
 			mnt := mnts[name]
@@ -498,7 +509,7 @@ func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
 				mnt = &mountHint{name: name}
 				mnts[name] = mnt
 			}
-			if err := mnt.setField(parts[4], v); err != nil {
+			if err := mnt.setField(parts[1], v); err != nil {
 				return nil, err
 			}
 		}
@@ -565,9 +576,17 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
 // processHints processes annotations that container hints about how volumes
 // should be mounted (e.g. a volume shared between containers). It must be
 // called for the root container only.
-func (c *containerMounter) processHints(conf *Config) error {
+func (c *containerMounter) processHints(conf *Config, creds *auth.Credentials) error {
+	if conf.VFS2 {
+		return c.processHintsVFS2(conf, creds)
+	}
 	ctx := c.k.SupervisorContext()
 	for _, hint := range c.hints.mounts {
+		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
+		// common gofer to mount all shared volumes.
+		if hint.mount.Type != tmpfsvfs2.Name {
+			continue
+		}
 		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
 		inode, err := c.mountSharedMaster(ctx, conf, hint)
 		if err != nil {
@@ -621,7 +640,7 @@ func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Confi
 
 func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error {
 	root := mns.Root()
-	defer root.DecRef()
+	defer root.DecRef(ctx)
 
 	for _, m := range c.mounts {
 		log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options)
@@ -702,7 +721,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
 	fd := c.fds.remove()
 	log.Infof("Mounting root over 9P, ioFD: %d", fd)
 	p9FS := mustFindFilesystem("9p")
-	opts := p9MountOptions(fd, conf.FileAccess)
+	opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */)
 
 	if conf.OverlayfsStaleRead {
 		// We can't check for overlayfs here because sandbox is chroot'ed and gofer
@@ -748,36 +767,40 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 	)
 
 	switch m.Type {
-	case devpts, devtmpfs, proc, sysfs:
+	case devpts.Name, devtmpfs.Name, procvfs2.Name, sysvfs2.Name:
 		fsName = m.Type
 	case nonefs:
-		fsName = sysfs
-	case tmpfs:
+		fsName = sysvfs2.Name
+	case tmpfsvfs2.Name:
 		fsName = m.Type
 
 		var err error
-		opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...)
+		opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
 		if err != nil {
 			return "", nil, false, err
 		}
 
 	case bind:
 		fd := c.fds.remove()
-		fsName = "9p"
-		// Non-root bind mounts are always shared.
-		opts = p9MountOptions(fd, FileAccessShared)
+		fsName = gofervfs2.Name
+		opts = p9MountData(fd, c.getMountAccessType(m), conf.VFS2)
 		// If configured, add overlay to all writable mounts.
 		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
 
 	default:
-		// TODO(nlacasse): Support all the mount types and make this a fatal error.
-		// Most applications will "just work" without them, so this is a warning
-		// for now.
 		log.Warningf("ignoring unknown filesystem type %q", m.Type)
 	}
 	return fsName, opts, useOverlay, nil
 }
 
+func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType {
+	if hint := c.hints.findMount(mount); hint != nil {
+		return hint.fileAccessType()
+	}
+	// Non-root bind mounts are always shared if no hints were provided.
+	return FileAccessShared
+}
+
 // mountSubmount mounts volumes inside the container's root. Because mounts may
 // be readonly, a lower ramfs overlay is added to create the mount point dir.
 // Another overlay is added with tmpfs on top if Config.Overlay is true.
@@ -805,7 +828,20 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
 
 	inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
 	if err != nil {
-		return fmt.Errorf("creating mount with source %q: %v", m.Source, err)
+		err := fmt.Errorf("creating mount with source %q: %v", m.Source, err)
+		// Check to see if this is a common error due to a Linux bug.
+		// This error is generated here in order to cause it to be
+		// printed to the user using Docker via 'runsc create' etc. rather
+		// than simply printed to the logs for the 'runsc boot' command.
+		//
+		// We check the error message string rather than type because the
+		// actual error types (syscall.EIO, syscall.EPIPE) are lost by the
+		// file system implementation (e.g. p9).
+		// TODO(gvisor.dev/issue/1765): Remove message when bug is resolved.
+		if strings.Contains(err.Error(), syscall.EIO.Error()) || strings.Contains(err.Error(), syscall.EPIPE.Error()) {
+			return fmt.Errorf("%v: %s", err, specutils.FaqErrorMsg("memlock", "you may be encountering a Linux kernel bug"))
+		}
+		return err
 	}
 
 	// If there are submounts, we need to overlay the mount on top of a ramfs
@@ -832,12 +868,12 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
 	if err != nil {
 		return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
 	}
-	defer dirent.DecRef()
+	defer dirent.DecRef(ctx)
 	if err := mns.Mount(ctx, dirent, inode); err != nil {
 		return fmt.Errorf("mount %q error: %v", m.Destination, err)
 	}
 
-	log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type)
+	log.Infof("Mounted %q to %q type: %s, internal-options: %q", m.Source, m.Destination, m.Type, opts)
 	return nil
 }
 
@@ -853,12 +889,12 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun
 	if err != nil {
 		return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err)
 	}
-	defer target.DecRef()
+	defer target.DecRef(ctx)
 
 	// Take a ref on the inode that is about to be (re)-mounted.
 	source.root.IncRef()
 	if err := mns.Mount(ctx, target, source.root); err != nil {
-		source.root.DecRef()
+		source.root.DecRef(ctx)
 		return fmt.Errorf("bind mount %q error: %v", mount.Destination, err)
 	}
 
@@ -900,7 +936,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn
 
 	// Add root mount.
 	fd := c.fds.remove()
-	opts := p9MountOptions(fd, conf.FileAccess)
+	opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */)
 
 	mf := fs.MountSourceFlags{}
 	if c.root.Readonly || conf.Overlay {
@@ -912,7 +948,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn
 		Flags:      mf,
 		DataString: strings.Join(opts, ","),
 	}
-	renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount)
+	renv.MountSources[gofervfs2.Name] = append(renv.MountSources[gofervfs2.Name], rootMount)
 
 	// Add submounts.
 	var tmpMounted bool
@@ -928,7 +964,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn
 	// TODO(b/67958150): handle '/tmp' properly (see mountTmp()).
 	if !tmpMounted {
 		tmpMount := specs.Mount{
-			Type:        tmpfs,
+			Type:        tmpfsvfs2.Name,
 			Destination: "/tmp",
 		}
 		if err := c.addRestoreMount(conf, renv, tmpMount); err != nil {
@@ -961,12 +997,12 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.M
 	switch err {
 	case nil:
 		// Found '/tmp' in filesystem, check if it's empty.
-		defer tmp.DecRef()
+		defer tmp.DecRef(ctx)
 		f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true})
 		if err != nil {
 			return err
 		}
-		defer f.DecRef()
+		defer f.DecRef(ctx)
 		serializer := &fs.CollectEntriesSerializer{}
 		if err := f.Readdir(ctx, serializer); err != nil {
 			return err
@@ -984,11 +1020,11 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.M
 		// No '/tmp' found (or fallthrough from above). Safe to mount internal
 		// tmpfs.
 		tmpMount := specs.Mount{
-			Type:        tmpfs,
+			Type:        tmpfsvfs2.Name,
 			Destination: "/tmp",
 			// Sticky bit is added to prevent accidental deletion of files from
 			// another user. This is normally done for /tmp.
-			Options: []string{"mode=1777"},
+			Options: []string{"mode=01777"},
 		}
 		return c.mountSubmount(ctx, conf, mns, root, tmpMount)
 
diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go
index 49ab34b33..912037075 100644
--- a/runsc/boot/fs_test.go
+++ b/runsc/boot/fs_test.go
@@ -15,7 +15,6 @@
 package boot
 
 import (
-	"path"
 	"reflect"
 	"strings"
 	"testing"
@@ -26,19 +25,19 @@ import (
 func TestPodMountHintsHappy(t *testing.T) {
 	spec := &specs.Spec{
 		Annotations: map[string]string{
-			path.Join(MountPrefix, "mount1", "source"): "foo",
-			path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
-			path.Join(MountPrefix, "mount1", "share"):  "pod",
+			MountPrefix + "mount1.source": "foo",
+			MountPrefix + "mount1.type":   "tmpfs",
+			MountPrefix + "mount1.share":  "pod",
 
-			path.Join(MountPrefix, "mount2", "source"):  "bar",
-			path.Join(MountPrefix, "mount2", "type"):    "bind",
-			path.Join(MountPrefix, "mount2", "share"):   "container",
-			path.Join(MountPrefix, "mount2", "options"): "rw,private",
+			MountPrefix + "mount2.source":  "bar",
+			MountPrefix + "mount2.type":    "bind",
+			MountPrefix + "mount2.share":   "container",
+			MountPrefix + "mount2.options": "rw,private",
 		},
 	}
 	podHints, err := newPodMountHints(spec)
 	if err != nil {
-		t.Errorf("newPodMountHints failed: %v", err)
+		t.Fatalf("newPodMountHints failed: %v", err)
 	}
 
 	// Check that fields were set correctly.
@@ -86,95 +85,95 @@ func TestPodMountHintsErrors(t *testing.T) {
 		{
 			name: "too short",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1"): "foo",
+				MountPrefix + "mount1": "foo",
 			},
 			error: "invalid mount annotation",
 		},
 		{
 			name: "no name",
 			annotations: map[string]string{
-				MountPrefix + "//source": "foo",
+				MountPrefix + ".source": "foo",
 			},
 			error: "invalid mount name",
 		},
 		{
 			name: "missing source",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "type"):  "tmpfs",
-				path.Join(MountPrefix, "mount1", "share"): "pod",
+				MountPrefix + "mount1.type":  "tmpfs",
+				MountPrefix + "mount1.share": "pod",
 			},
 			error: "source field",
 		},
 		{
 			name: "missing type",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "foo",
-				path.Join(MountPrefix, "mount1", "share"):  "pod",
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.share":  "pod",
 			},
 			error: "type field",
 		},
 		{
 			name: "missing share",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "foo",
-				path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "tmpfs",
 			},
 			error: "share field",
 		},
 		{
 			name: "invalid field name",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "invalid"): "foo",
+				MountPrefix + "mount1.invalid": "foo",
 			},
 			error: "invalid mount annotation",
 		},
 		{
 			name: "invalid source",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "",
-				path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
-				path.Join(MountPrefix, "mount1", "share"):  "pod",
+				MountPrefix + "mount1.source": "",
+				MountPrefix + "mount1.type":   "tmpfs",
+				MountPrefix + "mount1.share":  "pod",
 			},
 			error: "source cannot be empty",
 		},
 		{
 			name: "invalid type",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "foo",
-				path.Join(MountPrefix, "mount1", "type"):   "invalid-type",
-				path.Join(MountPrefix, "mount1", "share"):  "pod",
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "invalid-type",
+				MountPrefix + "mount1.share":  "pod",
 			},
 			error: "invalid type",
 		},
 		{
 			name: "invalid share",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "foo",
-				path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
-				path.Join(MountPrefix, "mount1", "share"):  "invalid-share",
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "tmpfs",
+				MountPrefix + "mount1.share":  "invalid-share",
 			},
 			error: "invalid share",
 		},
 		{
 			name: "invalid options",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"):  "foo",
-				path.Join(MountPrefix, "mount1", "type"):    "tmpfs",
-				path.Join(MountPrefix, "mount1", "share"):   "pod",
-				path.Join(MountPrefix, "mount1", "options"): "invalid-option",
+				MountPrefix + "mount1.source":  "foo",
+				MountPrefix + "mount1.type":    "tmpfs",
+				MountPrefix + "mount1.share":   "pod",
+				MountPrefix + "mount1.options": "invalid-option",
 			},
 			error: "unknown mount option",
 		},
 		{
 			name: "duplicate source",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "foo",
-				path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
-				path.Join(MountPrefix, "mount1", "share"):  "pod",
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "tmpfs",
+				MountPrefix + "mount1.share":  "pod",
 
-				path.Join(MountPrefix, "mount2", "source"): "foo",
-				path.Join(MountPrefix, "mount2", "type"):   "bind",
-				path.Join(MountPrefix, "mount2", "share"):  "container",
+				MountPrefix + "mount2.source": "foo",
+				MountPrefix + "mount2.type":   "bind",
+				MountPrefix + "mount2.share":  "container",
 			},
 			error: "have the same mount source",
 		},
@@ -191,3 +190,61 @@ func TestPodMountHintsErrors(t *testing.T) {
 		})
 	}
 }
+
+func TestGetMountAccessType(t *testing.T) {
+	const source = "foo"
+	for _, tst := range []struct {
+		name        string
+		annotations map[string]string
+		want        FileAccessType
+	}{
+		{
+			name: "container=exclusive",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": source,
+				MountPrefix + "mount1.type":   "bind",
+				MountPrefix + "mount1.share":  "container",
+			},
+			want: FileAccessExclusive,
+		},
+		{
+			name: "pod=shared",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": source,
+				MountPrefix + "mount1.type":   "bind",
+				MountPrefix + "mount1.share":  "pod",
+			},
+			want: FileAccessShared,
+		},
+		{
+			name: "shared=shared",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": source,
+				MountPrefix + "mount1.type":   "bind",
+				MountPrefix + "mount1.share":  "shared",
+			},
+			want: FileAccessShared,
+		},
+		{
+			name: "default=shared",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": source + "mismatch",
+				MountPrefix + "mount1.type":   "bind",
+				MountPrefix + "mount1.share":  "container",
+			},
+			want: FileAccessShared,
+		},
+	} {
+		t.Run(tst.name, func(t *testing.T) {
+			spec := &specs.Spec{Annotations: tst.annotations}
+			podHints, err := newPodMountHints(spec)
+			if err != nil {
+				t.Fatalf("newPodMountHints failed: %v", err)
+			}
+			mounter := containerMounter{hints: podHints}
+			if got := mounter.getMountAccessType(specs.Mount{Source: source}); got != tst.want {
+				t.Errorf("getMountAccessType(), want: %v, got: %v", tst.want, got)
+			}
+		})
+	}
+}
diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go
index d1c0bb9b5..ce62236e5 100644
--- a/runsc/boot/limits.go
+++ b/runsc/boot/limits.go
@@ -16,12 +16,12 @@ package boot
 
 import (
 	"fmt"
-	"sync"
 	"syscall"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // Mapping from linux resource names to limits.LimitType.
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 0c0eba99e..40c6f99fd 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -16,26 +16,30 @@
 package boot
 
 import (
+	"errors"
 	"fmt"
 	mrand "math/rand"
 	"os"
 	"runtime"
-	"sync"
 	"sync/atomic"
-	"syscall"
 	gtime "time"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/memutil"
 	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/fdimport"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
+	"gvisor.dev/gvisor/pkg/sentry/fs/user"
+	hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -43,11 +47,14 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/sighandling"
-	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2"
 	"gvisor.dev/gvisor/pkg/sentry/time"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
 	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
 	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
@@ -59,43 +66,46 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
 	"gvisor.dev/gvisor/runsc/boot/filter"
 	_ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms.
+	"gvisor.dev/gvisor/runsc/boot/pprof"
 	"gvisor.dev/gvisor/runsc/specutils"
 
 	// Include supported socket providers.
 	"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
 	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink"
 	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route"
+	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
 )
 
-// Loader keeps state needed to start the kernel and run the container..
-type Loader struct {
-	// k is the kernel.
-	k *kernel.Kernel
-
-	// ctrl is the control server.
-	ctrl *controller
-
+type containerInfo struct {
 	conf *Config
 
-	// console is set to true if terminal is enabled.
-	console bool
+	// spec is the base configuration for the root container.
+	spec *specs.Spec
 
-	watchdog *watchdog.Watchdog
+	// procArgs refers to the container's init task.
+	procArgs kernel.CreateProcessArgs
 
 	// stdioFDs contains stdin, stdout, and stderr.
 	stdioFDs []int
 
 	// goferFDs are the FDs that attach the sandbox to the gofers.
 	goferFDs []int
+}
 
-	// spec is the base configuration for the root container.
-	spec *specs.Spec
+// Loader keeps state needed to start the kernel and run the container.
+type Loader struct {
+	// k is the kernel.
+	k *kernel.Kernel
+
+	// ctrl is the control server.
+	ctrl *controller
 
-	// startSignalForwarding enables forwarding of signals to the sandboxed
-	// container. It should be called after the init process is loaded.
-	startSignalForwarding func() func()
+	// root contains information about the root container in the sandbox.
+	root containerInfo
+
+	watchdog *watchdog.Watchdog
 
 	// stopSignalForwarding disables forwarding of signals to the sandboxed
 	// container. It should be called when a sandbox is destroyed.
@@ -104,9 +114,6 @@ type Loader struct {
 	// restore is set to true if we are restoring a container.
 	restore bool
 
-	// rootProcArgs refers to the root sandbox init task.
-	rootProcArgs kernel.CreateProcessArgs
-
 	// sandboxID is the ID for the whole sandbox.
 	sandboxID string
 
@@ -139,6 +146,9 @@ type execProcess struct {
 	// tty will be nil if the process is not attached to a terminal.
 	tty *host.TTYFileOperations
 
+	// ttyVFS2 will be nil if the process is not attached to a terminal.
+	ttyVFS2 *hostvfs2.TTYFileDescription
+
 	// pidnsPath is the pid namespace path in spec
 	pidnsPath string
 }
@@ -146,9 +156,6 @@ type execProcess struct {
 func init() {
 	// Initialize the random number generator.
 	mrand.Seed(gtime.Now().UnixNano())
-
-	// Register the global syscall table.
-	kernel.RegisterSyscallTable(slinux.AMD64)
 }
 
 // Args are the arguments for New().
@@ -159,16 +166,18 @@ type Args struct {
 	Spec *specs.Spec
 	// Conf is the system configuration.
 	Conf *Config
-	// ControllerFD is the FD to the URPC controller.
+	// ControllerFD is the FD to the URPC controller. The Loader takes ownership
+	// of this FD and may close it at any time.
 	ControllerFD int
-	// Device is an optional argument that is passed to the platform.
+	// Device is an optional argument that is passed to the platform. The Loader
+	// takes ownership of this file and may close it at any time.
 	Device *os.File
-	// GoferFDs is an array of FDs used to connect with the Gofer.
+	// GoferFDs is an array of FDs used to connect with the Gofer. The Loader
+	// takes ownership of these FDs and may close them at any time.
 	GoferFDs []int
-	// StdioFDs is the stdio for the application.
+	// StdioFDs is the stdio for the application. The Loader takes ownership of
+	// these FDs and may close them at any time.
 	StdioFDs []int
-	// Console is set to true if using TTY.
-	Console bool
 	// NumCPU is the number of CPUs to create inside the sandbox.
 	NumCPU int
 	// TotalMem is the initial amount of total memory to report back to the
@@ -178,6 +187,9 @@ type Args struct {
 	UserLogFD int
 }
 
+// startingStdioFD is the first host FD that stdio FDs are remapped to, so that stdioFDs are always the same on initial start and on restore.
+const startingStdioFD = 256
+
 // New initializes a new kernel loader configured by spec.
 // New also handles setting up a kernel for restoring a container.
 func New(args Args) (*Loader, error) {
@@ -191,6 +203,16 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("setting up memory usage: %v", err)
 	}
 
+	// Is this a VFSv2 kernel?
+	if args.Conf.VFS2 {
+		kernel.VFS2Enabled = true
+		if args.Conf.FUSE {
+			kernel.FUSEEnabled = true
+		}
+
+		vfs2.Override()
+	}
+
 	// Create kernel and platform.
 	p, err := createPlatform(args.Conf, args.Device)
 	if err != nil {
@@ -210,9 +232,7 @@ func New(args Args) (*Loader, error) {
 	// Create VDSO.
 	//
 	// Pass k as the platform since it is savable, unlike the actual platform.
-	//
-	// FIXME(b/109889800): Use non-nil context.
-	vdso, err := loader.PrepareVDSO(nil, k)
+	vdso, err := loader.PrepareVDSO(k)
 	if err != nil {
 		return nil, fmt.Errorf("creating vdso: %v", err)
 	}
@@ -228,11 +248,8 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("enabling strace: %v", err)
 	}
 
-	// Create an empty network stack because the network namespace may be empty at
-	// this point. Netns is configured before Run() is called. Netstack is
-	// configured using a control uRPC message. Host network is configured inside
-	// Run().
-	networkStack, err := newEmptyNetworkStack(args.Conf, k)
+	// Create root network namespace/stack.
+	netns, err := newRootNetworkNamespace(args.Conf, k, k)
 	if err != nil {
 		return nil, fmt.Errorf("creating network: %v", err)
 	}
@@ -275,7 +292,7 @@ func New(args Args) (*Loader, error) {
 		FeatureSet:                  cpuid.HostFeatureSet(),
 		Timekeeper:                  tk,
 		RootUserNamespace:           creds.UserNamespace,
-		NetworkStack:                networkStack,
+		RootNetworkNamespace:        netns,
 		ApplicationCores:            uint(args.NumCPU),
 		Vdso:                        vdso,
 		RootUTSNamespace:            kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
@@ -286,6 +303,12 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("initializing kernel: %v", err)
 	}
 
+	if kernel.VFS2Enabled {
+		if err := registerFilesystems(k); err != nil {
+			return nil, fmt.Errorf("registering filesystems: %w", err)
+		}
+	}
+
 	if err := adjustDirentCache(k); err != nil {
 		return nil, err
 	}
@@ -300,9 +323,11 @@ func New(args Args) (*Loader, error) {
 	}
 
 	// Create a watchdog.
-	dog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction)
+	dogOpts := watchdog.DefaultOpts
+	dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
+	dog := watchdog.New(k, dogOpts)
 
-	procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
+	procArgs, err := createProcessArgs(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
 	if err != nil {
 		return nil, fmt.Errorf("creating init process for root container: %v", err)
 	}
@@ -316,19 +341,57 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("creating pod mount hints: %v", err)
 	}
 
+	if kernel.VFS2Enabled {
+		// Set up host mount that will be used for imported fds.
+		hostFilesystem, err := hostvfs2.NewFilesystem(k.VFS())
+		if err != nil {
+			return nil, fmt.Errorf("failed to create hostfs filesystem: %v", err)
+		}
+		defer hostFilesystem.DecRef(k.SupervisorContext())
+		hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{})
+		if err != nil {
+			return nil, fmt.Errorf("failed to create hostfs mount: %v", err)
+		}
+		k.SetHostMount(hostMount)
+	}
+
+	// Make host FDs stable between invocations. Host FDs must map to the exact
+	// same number when the sandbox is restored. Otherwise the wrong FD will be
+	// used.
+	var stdioFDs []int
+	newfd := startingStdioFD
+	for _, fd := range args.StdioFDs {
+		// Check that newfd is unused to avoid clobbering over it.
+		if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
+			if err != nil {
+				return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
+			}
+			return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
+		}
+
+		err := unix.Dup3(fd, newfd, unix.O_CLOEXEC)
+		if err != nil {
+			return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err)
+		}
+		stdioFDs = append(stdioFDs, newfd)
+		_ = unix.Close(fd)
+		newfd++
+	}
+
 	eid := execID{cid: args.ID}
 	l := &Loader{
-		k:            k,
-		conf:         args.Conf,
-		console:      args.Console,
-		watchdog:     dog,
-		spec:         args.Spec,
-		goferFDs:     args.GoferFDs,
-		stdioFDs:     args.StdioFDs,
-		rootProcArgs: procArgs,
-		sandboxID:    args.ID,
-		processes:    map[execID]*execProcess{eid: {}},
-		mountHints:   mountHints,
+		k:          k,
+		watchdog:   dog,
+		sandboxID:  args.ID,
+		processes:  map[execID]*execProcess{eid: {}},
+		mountHints: mountHints,
+		root: containerInfo{
+			conf:     args.Conf,
+			stdioFDs: stdioFDs,
+			goferFDs: args.GoferFDs,
+			spec:     args.Spec,
+			procArgs: procArgs,
+		},
 	}
 
 	// We don't care about child signals; some platforms can generate a
@@ -337,29 +400,6 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("ignore child stop signals failed: %v", err)
 	}
 
-	// Handle signals by forwarding them to the root container process
-	// (except for panic signal, which should cause a panic).
-	l.startSignalForwarding = sighandling.PrepareHandler(func(sig linux.Signal) {
-		// Panic signal should cause a panic.
-		if args.Conf.PanicSignal != -1 && sig == linux.Signal(args.Conf.PanicSignal) {
-			panic("Signal-induced panic")
-		}
-
-		// Otherwise forward to root container.
-		deliveryMode := DeliverToProcess
-		if args.Console {
-			// Since we are running with a console, we should
-			// forward the signal to the foreground process group
-			// so that job control signals like ^C can be handled
-			// properly.
-			deliveryMode = DeliverToForegroundProcessGroup
-		}
-		log.Infof("Received external signal %d, mode: %v", sig, deliveryMode)
-		if err := l.signal(args.ID, 0, int32(sig), deliveryMode); err != nil {
-			log.Warningf("error sending signal %v to container %q: %v", sig, args.ID, err)
-		}
-	})
-
 	// Create the control server using the provided FD.
 	//
 	// This must be done *after* we have initialized the kernel since the
@@ -379,19 +419,24 @@ func New(args Args) (*Loader, error) {
 	return l, nil
 }
 
-// newProcess creates a process that can be run with kernel.CreateProcess.
-func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
+// createProcessArgs creates args that can be used with kernel.CreateProcess.
+func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
 	// Create initial limits.
 	ls, err := createLimitSet(spec)
 	if err != nil {
 		return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %v", err)
 	}
 
+	wd := spec.Process.Cwd
+	if wd == "" {
+		wd = "/"
+	}
+
 	// Create the process arguments.
 	procArgs := kernel.CreateProcessArgs{
 		Argv:                    spec.Process.Args,
 		Envv:                    spec.Process.Env,
-		WorkingDirectory:        spec.Process.Cwd, // Defaults to '/' if empty.
+		WorkingDirectory:        wd,
 		Credentials:             creds,
 		Umask:                   0022,
 		Limits:                  ls,
@@ -419,6 +464,11 @@ func (l *Loader) Destroy() {
 		l.stopSignalForwarding()
 	}
 	l.watchdog.Stop()
+
+	for i, fd := range l.root.stdioFDs {
+		_ = unix.Close(fd)
+		l.root.stdioFDs[i] = -1
+	}
 }
 
 func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) {
@@ -449,13 +499,13 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {
 }
 
 func (l *Loader) installSeccompFilters() error {
-	if l.conf.DisableSeccomp {
+	if l.root.conf.DisableSeccomp {
 		filter.Report("syscall filter is DISABLED. Running in less secure mode.")
 	} else {
 		opts := filter.Options{
 			Platform:      l.k.Platform,
-			HostNetwork:   l.conf.Network == NetworkHost,
-			ProfileEnable: l.conf.ProfileEnable,
+			HostNetwork:   l.root.conf.Network == NetworkHost,
+			ProfileEnable: l.root.conf.ProfileEnable,
 			ControllerFD:  l.ctrl.srv.FD(),
 		}
 		if err := filter.Install(opts); err != nil {
@@ -481,11 +531,11 @@ func (l *Loader) Run() error {
 }
 
 func (l *Loader) run() error {
-	if l.conf.Network == NetworkHost {
+	if l.root.conf.Network == NetworkHost {
 		// Delay host network configuration to this point because network namespace
 		// is configured after the loader is created and before Run() is called.
 		log.Debugf("Configuring host network")
-		stack := l.k.NetworkStack().(*hostinet.Stack)
+		stack := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack)
 		if err := stack.Configure(); err != nil {
 			return err
 		}
@@ -503,8 +553,8 @@ func (l *Loader) run() error {
 	// If we are restoring, we do not want to create a process.
 	// l.restore is set by the container manager when a restore call is made.
 	if !l.restore {
-		if l.conf.ProfileEnable {
-			initializePProf()
+		if l.root.conf.ProfileEnable {
+			pprof.Initialize()
 		}
 
 		// Finally done with all configuration. Setup filters before user code
@@ -513,62 +563,50 @@ func (l *Loader) run() error {
 			return err
 		}
 
-		// Create the FD map, which will set stdin, stdout, and stderr.  If console
-		// is true, then ioctl calls will be passed through to the host fd.
-		ctx := l.rootProcArgs.NewContext(l.k)
-		fdTable, err := createFDTable(ctx, l.console, l.stdioFDs)
-		if err != nil {
-			return fmt.Errorf("importing fds: %v", err)
-		}
-		// CreateProcess takes a reference on FDMap if successful. We won't need
-		// ours either way.
-		l.rootProcArgs.FDTable = fdTable
-
-		// Setup the root container file system.
-		l.startGoferMonitor(l.sandboxID, l.goferFDs)
-
-		mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
-		if err := mntr.processHints(l.conf); err != nil {
-			return err
-		}
-		if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil {
-			return err
-		}
-
-		// Add the HOME enviroment variable if it is not already set.
-		envv, err := maybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
-		if err != nil {
-			return err
-		}
-		l.rootProcArgs.Envv = envv
-
 		// Create the root container init task. It will begin running
 		// when the kernel is started.
-		if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil {
-			return fmt.Errorf("creating init process: %v", err)
+		if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil {
+			return err
 		}
-
-		// CreateProcess takes a reference on FDTable if successful.
-		l.rootProcArgs.FDTable.DecRef()
 	}
 
 	ep.tg = l.k.GlobalInit()
-	if ns, ok := specutils.GetNS(specs.PIDNamespace, l.spec); ok {
+	if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok {
 		ep.pidnsPath = ns.Path
 	}
-	if l.console {
-		ttyFile, _ := l.rootProcArgs.FDTable.Get(0)
-		defer ttyFile.DecRef()
-		ep.tty = ttyFile.FileOperations.(*host.TTYFileOperations)
 
-		// Set the foreground process group on the TTY to the global
-		// init process group, since that is what we are about to
-		// start running.
-		ep.tty.InitForegroundProcessGroup(ep.tg.ProcessGroup())
-	}
+	// Handle signals by forwarding them to the root container process
+	// (except for panic signal, which should cause a panic).
+	l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
+		// Panic signal should cause a panic.
+		if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) {
+			panic("Signal-induced panic")
+		}
 
-	// Start signal forwarding only after an init process is created.
-	l.stopSignalForwarding = l.startSignalForwarding()
+		// Otherwise forward to root container.
+		deliveryMode := DeliverToProcess
+		if l.root.spec.Process.Terminal {
+			// Since we are running with a console, we should forward the signal to
+			// the foreground process group so that job control signals like ^C can
+			// be handled properly.
+			deliveryMode = DeliverToForegroundProcessGroup
+		}
+		log.Infof("Received external signal %d, mode: %v", sig, deliveryMode)
+		if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil {
+			log.Warningf("error sending signal %v to container %q: %v", sig, l.sandboxID, err)
+		}
+	})
+
+	// l.root.stdioFDs were derived from dup() in boot.New() and have now been
+	// dup()ed again, either in createFDTable() during initial start or in
+	// descriptor.initAfterLoad() during restore, so we can release them now.
+	// VFS2 takes ownership of the passed FDs, so only close for VFS1.
+	if !kernel.VFS2Enabled {
+		for i, fd := range l.root.stdioFDs {
+			_ = unix.Close(fd)
+			l.root.stdioFDs[i] = -1
+		}
+	}
 
 	log.Infof("Process should have started...")
 	l.watchdog.Start()
@@ -601,8 +639,8 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
 	l.mu.Lock()
 	defer l.mu.Unlock()
 
-	eid := execID{cid: cid}
-	if _, ok := l.processes[eid]; !ok {
+	ep := l.processes[execID{cid: cid}]
+	if ep == nil {
 		return fmt.Errorf("trying to start a deleted container %q", cid)
 	}
 
@@ -636,61 +674,112 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
 		if pidns == nil {
 			pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace())
 		}
-		l.processes[eid].pidnsPath = ns.Path
+		ep.pidnsPath = ns.Path
 	} else {
 		pidns = l.k.RootPIDNamespace()
 	}
-	procArgs, err := newProcess(cid, spec, creds, l.k, pidns)
+
+	info := &containerInfo{
+		conf: conf,
+		spec: spec,
+	}
+	info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns)
 	if err != nil {
 		return fmt.Errorf("creating new process: %v", err)
 	}
 
 	// setupContainerFS() dups stdioFDs, so we don't need to dup them here.
-	var stdioFDs []int
 	for _, f := range files[:3] {
-		stdioFDs = append(stdioFDs, int(f.Fd()))
-	}
-
-	// Create the FD map, which will set stdin, stdout, and stderr.
-	ctx := procArgs.NewContext(l.k)
-	fdTable, err := createFDTable(ctx, false, stdioFDs)
-	if err != nil {
-		return fmt.Errorf("importing fds: %v", err)
+		info.stdioFDs = append(info.stdioFDs, int(f.Fd()))
 	}
-	// CreateProcess takes a reference on fdTable if successful. We won't
-	// need ours either way.
-	procArgs.FDTable = fdTable
 
 	// Can't take ownership away from os.File. dup them to get a new FDs.
-	var goferFDs []int
 	for _, f := range files[3:] {
-		fd, err := syscall.Dup(int(f.Fd()))
+		fd, err := unix.Dup(int(f.Fd()))
 		if err != nil {
 			return fmt.Errorf("failed to dup file: %v", err)
 		}
-		goferFDs = append(goferFDs, fd)
+		info.goferFDs = append(info.goferFDs, fd)
 	}
 
+	tg, err := l.createContainerProcess(false, cid, info, ep)
+	if err != nil {
+		return err
+	}
+
+	// Success!
+	l.k.StartProcess(tg)
+	ep.tg = tg
+	return nil
+}
+
+func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo, ep *execProcess) (*kernel.ThreadGroup, error) {
+	console := false
+	if root {
+		// Only root container supports terminal for now.
+		console = info.spec.Process.Terminal
+	}
+
+	// Create the FD map, which will set stdin, stdout, and stderr.
+	ctx := info.procArgs.NewContext(l.k)
+	fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, console, info.stdioFDs)
+	if err != nil {
+		return nil, fmt.Errorf("importing fds: %v", err)
+	}
+	// CreateProcess takes a reference on fdTable if successful. We won't need
+	// ours either way.
+	info.procArgs.FDTable = fdTable
+
 	// Setup the child container file system.
-	l.startGoferMonitor(cid, goferFDs)
+	l.startGoferMonitor(cid, info.goferFDs)
 
-	mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints)
-	if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil {
-		return err
+	mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints)
+	if root {
+		if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil {
+			return nil, err
+		}
+	}
+	if err := setupContainerFS(ctx, info.conf, mntr, &info.procArgs); err != nil {
+		return nil, err
 	}
 
-	// Create and start the new process.
-	tg, _, err := l.k.CreateProcess(procArgs)
+	// Add the HOME environment variable if it is not already set.
+	var envv []string
+	if kernel.VFS2Enabled {
+		envv, err = user.MaybeAddExecUserHomeVFS2(ctx, info.procArgs.MountNamespaceVFS2,
+			info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
+
+	} else {
+		envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace,
+			info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
+	}
 	if err != nil {
-		return fmt.Errorf("creating process: %v", err)
+		return nil, err
 	}
-	l.k.StartProcess(tg)
+	info.procArgs.Envv = envv
 
+	// Create the new process. Starting it is left to the caller.
+	tg, _, err := l.k.CreateProcess(info.procArgs)
+	if err != nil {
+		return nil, fmt.Errorf("creating process: %v", err)
+	}
 	// CreateProcess takes a reference on FDTable if successful.
-	procArgs.FDTable.DecRef()
+	info.procArgs.FDTable.DecRef(ctx)
+
+	// Set the foreground process group on the TTY to the global init process
+	// group, since that is what we are about to start running.
+	if root {
+		switch {
+		case ttyFileVFS2 != nil:
+			ep.ttyVFS2 = ttyFileVFS2
+			ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup())
+		case ttyFile != nil:
+			ep.tty = ttyFile
+			ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
+		}
+	}
 
-	l.processes[eid].tg = tg
-	return nil
+	return tg, nil
 }
 
 // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
@@ -738,14 +827,14 @@ func (l *Loader) destroyContainer(cid string) error {
 	l.mu.Lock()
 	defer l.mu.Unlock()
 
-	_, _, started, err := l.threadGroupFromIDLocked(execID{cid: cid})
+	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid})
 	if err != nil {
 		// Container doesn't exist.
 		return err
 	}
 
-	// The container exists, has it been started?
-	if started {
+	// The container exists, but has it been started?
+	if tg != nil {
 		if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
 			return fmt.Errorf("sending SIGKILL to all container processes: %v", err)
 		}
@@ -787,45 +876,63 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 	l.mu.Lock()
 	defer l.mu.Unlock()
 
-	tg, _, started, err := l.threadGroupFromIDLocked(execID{cid: args.ContainerID})
+	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID})
 	if err != nil {
 		return 0, err
 	}
-	if !started {
+	if tg == nil {
 		return 0, fmt.Errorf("container %q not started", args.ContainerID)
 	}
 
 	// Get the container MountNamespace from the Task.
-	tg.Leader().WithMuLocked(func(t *kernel.Task) {
-		// task.MountNamespace() does not take a ref, so we must do so
-		// ourselves.
-		args.MountNamespace = t.MountNamespace()
-		args.MountNamespace.IncRef()
-	})
-	defer args.MountNamespace.DecRef()
+	if kernel.VFS2Enabled {
+		// Task.MountNamespaceVFS2() does not take a ref, so we must do so ourselves.
+		args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2()
+		args.MountNamespaceVFS2.IncRef()
+	} else {
+		tg.Leader().WithMuLocked(func(t *kernel.Task) {
+			// task.MountNamespace() does not take a ref, so we must do so ourselves.
+			args.MountNamespace = t.MountNamespace()
+			args.MountNamespace.IncRef()
+		})
+	}
 
-	// Add the HOME enviroment varible if it is not already set.
-	root := args.MountNamespace.Root()
-	defer root.DecRef()
-	ctx := fs.WithRoot(l.k.SupervisorContext(), root)
-	envv, err := maybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
-	if err != nil {
-		return 0, err
+	// Add the HOME environment variable if it is not already set.
+	if kernel.VFS2Enabled {
+		root := args.MountNamespaceVFS2.Root()
+		ctx := vfs.WithRoot(l.k.SupervisorContext(), root)
+		defer args.MountNamespaceVFS2.DecRef(ctx)
+		defer root.DecRef(ctx)
+		envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv)
+		if err != nil {
+			return 0, err
+		}
+		args.Envv = envv
+	} else {
+		root := args.MountNamespace.Root()
+		ctx := fs.WithRoot(l.k.SupervisorContext(), root)
+		defer args.MountNamespace.DecRef(ctx)
+		defer root.DecRef(ctx)
+		envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
+		if err != nil {
+			return 0, err
+		}
+		args.Envv = envv
 	}
-	args.Envv = envv
 
 	// Start the process.
 	proc := control.Proc{Kernel: l.k}
 	args.PIDNamespace = tg.PIDNamespace()
-	newTG, tgid, ttyFile, err := control.ExecAsync(&proc, args)
+	newTG, tgid, ttyFile, ttyFileVFS2, err := control.ExecAsync(&proc, args)
 	if err != nil {
 		return 0, err
 	}
 
 	eid := execID{cid: args.ContainerID, pid: tgid}
 	l.processes[eid] = &execProcess{
-		tg:  newTG,
-		tty: ttyFile,
+		tg:      newTG,
+		tty:     ttyFile,
+		ttyVFS2: ttyFileVFS2,
 	}
 	log.Debugf("updated processes: %v", l.processes)
 
@@ -836,7 +943,7 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
 	// Don't defer unlock, as doing so would make it impossible for
 	// multiple clients to wait on the same container.
-	tg, _, err := l.threadGroupFromID(execID{cid: cid})
+	tg, err := l.threadGroupFromID(execID{cid: cid})
 	if err != nil {
 		return fmt.Errorf("can't wait for container %q: %v", cid, err)
 	}
@@ -855,7 +962,7 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) e
 
 	// Try to find a process that was exec'd
 	eid := execID{cid: cid, pid: tgid}
-	execTG, _, err := l.threadGroupFromID(eid)
+	execTG, err := l.threadGroupFromID(eid)
 	if err == nil {
 		ws := l.wait(execTG)
 		*waitStatus = ws
@@ -869,7 +976,7 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) e
 
 	// The caller may be waiting on a process not started directly via exec.
 	// In this case, find the process in the container's PID namespace.
-	initTG, _, err := l.threadGroupFromID(execID{cid: cid})
+	initTG, err := l.threadGroupFromID(execID{cid: cid})
 	if err != nil {
 		return fmt.Errorf("waiting for PID %d: %v", tgid, err)
 	}
@@ -902,50 +1009,98 @@ func (l *Loader) WaitExit() kernel.ExitStatus {
 	// Wait for container.
 	l.k.WaitExited()
 
+	// Cleanup
+	l.ctrl.stop()
+
+	refs.OnExit()
+
 	return l.k.GlobalInit().ExitStatus()
 }
 
-func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
+func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
+	// Create an empty network stack because the network namespace may be empty at
+	// this point. Netns is configured before Run() is called. Netstack is
+	// configured using a control uRPC message. Host network is configured inside
+	// Run().
 	switch conf.Network {
 	case NetworkHost:
-		return hostinet.NewStack(), nil
+		// No network namespacing support for hostinet yet, hence creator is nil.
+		return inet.NewRootNamespace(hostinet.NewStack(), nil), nil
 
 	case NetworkNone, NetworkSandbox:
-		// NetworkNone sets up loopback using netstack.
-		netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
-		transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
-		s := netstack.Stack{stack.New(stack.Options{
-			NetworkProtocols:   netProtos,
-			TransportProtocols: transProtos,
-			Clock:              clock,
-			Stats:              netstack.Metrics,
-			HandleLocal:        true,
-			// Enable raw sockets for users with sufficient
-			// privileges.
-			RawFactory: raw.EndpointFactory{},
-		})}
-
-		// Enable SACK Recovery.
-		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
-			return nil, fmt.Errorf("failed to enable SACK: %v", err)
+		s, err := newEmptySandboxNetworkStack(clock, uniqueID)
+		if err != nil {
+			return nil, err
 		}
+		creator := &sandboxNetstackCreator{
+			clock:    clock,
+			uniqueID: uniqueID,
+		}
+		return inet.NewRootNamespace(s, creator), nil
 
-		// Set default TTLs as required by socket/netstack.
-		s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
-		s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+	default:
+		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
+	}
 
-		// Enable Receive Buffer Auto-Tuning.
-		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
-			return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
-		}
+}
 
-		s.FillDefaultIPTables()
+func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
+	netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
+	transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
+	s := netstack.Stack{stack.New(stack.Options{
+		NetworkProtocols:   netProtos,
+		TransportProtocols: transProtos,
+		Clock:              clock,
+		Stats:              netstack.Metrics,
+		HandleLocal:        true,
+		// Enable raw sockets for users with sufficient
+		// privileges.
+		RawFactory: raw.EndpointFactory{},
+		UniqueID:   uniqueID,
+	})}
 
-		return &s, nil
+	// Enable SACK Recovery.
+	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
+		return nil, fmt.Errorf("failed to enable SACK: %s", err)
+	}
 
-	default:
-		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
+	// Set default TTLs as required by socket/netstack.
+	s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+	s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+
+	// Enable Receive Buffer Auto-Tuning.
+	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
+		return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err)
+	}
+
+	return &s, nil
+}
+
+// sandboxNetstackCreator implements kernel.NetworkStackCreator.
+//
+// +stateify savable
+type sandboxNetstackCreator struct {
+	clock    tcpip.Clock
+	uniqueID stack.UniqueID
+}
+
+// CreateStack implements kernel.NetworkStackCreator.CreateStack.
+func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
+	s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID)
+	if err != nil {
+		return nil, err
+	}
+
+	// Setup loopback.
+	n := &Network{Stack: s.(*netstack.Stack).Stack}
+	nicID := tcpip.NICID(f.uniqueID.UniqueID())
+	link := DefaultLoopbackLink
+	linkEP := loopback.New()
+	if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
+		return nil, err
 	}
+
+	return s, nil
 }
 
 // signal sends a signal to one or more processes in a container. If PID is 0,
@@ -975,8 +1130,7 @@ func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) e
 			return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid)
 		}
 		// Check that the container has actually started before signaling it.
-		_, _, err := l.threadGroupFromID(execID{cid: cid})
-		if err != nil {
+		if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil {
 			return err
 		}
 		if err := l.signalAllProcesses(cid, signo); err != nil {
@@ -990,16 +1144,16 @@ func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) e
 }
 
 func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error {
-	execTG, _, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
+	execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
 	if err == nil {
 		// Send signal directly to the identified process.
-		return execTG.SendSignal(&arch.SignalInfo{Signo: signo})
+		return l.k.SendExternalSignalThreadGroup(execTG, &arch.SignalInfo{Signo: signo})
 	}
 
 	// The caller may be signaling a process not started directly via exec.
 	// In this case, find the process in the container's PID namespace and
 	// signal it.
-	initTG, _, err := l.threadGroupFromID(execID{cid: cid})
+	initTG, err := l.threadGroupFromID(execID{cid: cid})
 	if err != nil {
 		return fmt.Errorf("no thread group found: %v", err)
 	}
@@ -1010,25 +1164,43 @@ func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) er
 	if tg.Leader().ContainerID() != cid {
 		return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID())
 	}
-	return tg.SendSignal(&arch.SignalInfo{Signo: signo})
+	return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo})
 }
 
+// signalForegrondProcessGroup looks up the foreground process group from the
+// TTY for the given "tgid" inside container "cid", and sends the signal to it.
 func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error {
-	// Lookup foreground process group from the TTY for the given process,
-	// and send the signal to it.
-	tg, tty, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
+	l.mu.Lock()
+	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid})
 	if err != nil {
+		l.mu.Unlock()
 		return fmt.Errorf("no thread group found: %v", err)
 	}
-	if tty == nil {
+	if tg == nil {
+		l.mu.Unlock()
+		return fmt.Errorf("container %q not started", cid)
+	}
+
+	tty, ttyVFS2, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid})
+	l.mu.Unlock()
+	if err != nil {
+		return fmt.Errorf("no thread group found: %v", err)
+	}
+
+	var pg *kernel.ProcessGroup
+	switch {
+	case ttyVFS2 != nil:
+		pg = ttyVFS2.ForegroundProcessGroup()
+	case tty != nil:
+		pg = tty.ForegroundProcessGroup()
+	default:
 		return fmt.Errorf("no TTY attached")
 	}
-	pg := tty.ForegroundProcessGroup()
 	if pg == nil {
 		// No foreground process group has been set. Signal the
 		// original thread group.
 		log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid)
-		return tg.SendSignal(&arch.SignalInfo{Signo: signo})
+		return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo})
 	}
 	// Send the signal to all processes in the process group.
 	var lastErr error
@@ -1036,7 +1208,7 @@ func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, s
 		if tg.ProcessGroup() != pg {
 			continue
 		}
-		if err := tg.SendSignal(&arch.SignalInfo{Signo: signo}); err != nil {
+		if err := l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}); err != nil {
 			lastErr = err
 		}
 	}
@@ -1054,33 +1226,57 @@ func (l *Loader) signalAllProcesses(cid string, signo int32) error {
 	return l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo})
 }
 
-// threadGroupFromID same as threadGroupFromIDLocked except that it acquires
-// mutex before calling it.
-func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, error) {
+// threadGroupFromID is similar to tryThreadGroupFromIDLocked except that it
+// acquires mutex before calling it and fails in case container hasn't started
+// yet.
+func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) {
 	l.mu.Lock()
 	defer l.mu.Unlock()
-	tg, tty, ok, err := l.threadGroupFromIDLocked(key)
+	tg, err := l.tryThreadGroupFromIDLocked(key)
 	if err != nil {
-		return nil, nil, err
+		return nil, err
 	}
-	if !ok {
-		return nil, nil, fmt.Errorf("container %q not started", key.cid)
+	if tg == nil {
+		return nil, fmt.Errorf("container %q not started", key.cid)
 	}
-	return tg, tty, nil
+	return tg, nil
 }
 
-// threadGroupFromIDLocked returns the thread group and TTY for the given
-// execution ID. TTY may be nil if the process is not attached to a terminal.
-// Also returns a boolean indicating whether the container has already started.
-// Returns error if execution ID is invalid or if the container cannot be
-// found (maybe it has been deleted). Caller must hold 'mu'.
-func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, bool, error) {
+// tryThreadGroupFromIDLocked returns the thread group for the given execution
+// ID. It may return nil in case the container has not started yet. Returns
+// error if execution ID is invalid or if the container cannot be found (maybe
+// it has been deleted). Caller must hold 'mu'.
+func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) {
 	ep := l.processes[key]
 	if ep == nil {
-		return nil, nil, false, fmt.Errorf("container %q not found", key.cid)
+		return nil, fmt.Errorf("container %q not found", key.cid)
 	}
-	if ep.tg == nil {
-		return nil, nil, false, nil
+	return ep.tg, nil
+}
+
+// ttyFromIDLocked returns the TTY files for the given execution ID. It may
+// return nil in case the container has not started yet. Returns error if
+// execution ID is invalid or if the container cannot be found (maybe it has
+// been deleted). Caller must hold 'mu'.
+func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+	ep := l.processes[key]
+	if ep == nil {
+		return nil, nil, fmt.Errorf("container %q not found", key.cid)
+	}
+	return ep.tty, ep.ttyVFS2, nil
+}
+
+func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+	if len(stdioFDs) != 3 {
+		return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
+	}
+
+	k := kernel.KernelFromContext(ctx)
+	fdTable := k.NewFDTable()
+	ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs)
+	if err != nil {
+		fdTable.DecRef(ctx)
+		return nil, nil, nil, err
 	}
-	return ep.tg, ep.tty, true, nil
+	return fdTable, ttyFile, ttyFileVFS2, nil
 }
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index 147ff7703..aa3fdf96c 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -19,17 +19,20 @@ import (
 	"math/rand"
 	"os"
 	"reflect"
-	"sync"
 	"syscall"
 	"testing"
 	"time"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/control/server"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/runsc/fsgofer"
 )
@@ -100,20 +103,29 @@ func startGofer(root string) (int, func(), error) {
 	return sandboxEnd, cleanup, nil
 }
 
-func createLoader() (*Loader, func(), error) {
+func createLoader(vfsEnabled bool, spec *specs.Spec) (*Loader, func(), error) {
 	fd, err := server.CreateSocket(ControlSocketAddr(fmt.Sprintf("%010d", rand.Int())[:10]))
 	if err != nil {
 		return nil, nil, err
 	}
 	conf := testConfig()
-	spec := testSpec()
+	conf.VFS2 = vfsEnabled
 
 	sandEnd, cleanup, err := startGofer(spec.Root.Path)
 	if err != nil {
 		return nil, nil, err
 	}
 
-	stdio := []int{int(os.Stdin.Fd()), int(os.Stdout.Fd()), int(os.Stderr.Fd())}
+	// Loader takes ownership of stdio.
+	var stdio []int
+	for _, f := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
+		newFd, err := unix.Dup(int(f.Fd()))
+		if err != nil {
+			return nil, nil, err
+		}
+		stdio = append(stdio, newFd)
+	}
+
 	args := Args{
 		ID:           "foo",
 		Spec:         spec,
@@ -132,10 +144,20 @@ func createLoader() (*Loader, func(), error) {
 
 // TestRun runs a simple application in a sandbox and checks that it succeeds.
 func TestRun(t *testing.T) {
-	l, cleanup, err := createLoader()
+	doRun(t, false)
+}
+
+// TestRunVFS2 runs TestRun in VFSv2.
+func TestRunVFS2(t *testing.T) {
+	doRun(t, true)
+}
+
+func doRun(t *testing.T, vfsEnabled bool) {
+	l, cleanup, err := createLoader(vfsEnabled, testSpec())
 	if err != nil {
 		t.Fatalf("error creating loader: %v", err)
 	}
+
 	defer l.Destroy()
 	defer cleanup()
 
@@ -169,7 +191,16 @@ func TestRun(t *testing.T) {
 // TestStartSignal tests that the controller Start message will cause
 // WaitForStartSignal to return.
 func TestStartSignal(t *testing.T) {
-	l, cleanup, err := createLoader()
+	doStartSignal(t, false)
+}
+
+// TestStartSignalVFS2 does TestStartSignal with VFS2.
+func TestStartSignalVFS2(t *testing.T) {
+	doStartSignal(t, true)
+}
+
+func doStartSignal(t *testing.T, vfsEnabled bool) {
+	l, cleanup, err := createLoader(vfsEnabled, testSpec())
 	if err != nil {
 		t.Fatalf("error creating loader: %v", err)
 	}
@@ -217,18 +248,19 @@ func TestStartSignal(t *testing.T) {
 
 }
 
-// Test that MountNamespace can be created with various specs.
-func TestCreateMountNamespace(t *testing.T) {
-	testCases := []struct {
-		name string
-		// Spec that will be used to create the mount manager.  Note
-		// that we can't mount procfs without a kernel, so each spec
-		// MUST contain something other than procfs mounted at /proc.
-		spec specs.Spec
-		// Paths that are expected to exist in the resulting fs.
-		expectedPaths []string
-	}{
-		{
+type CreateMountTestcase struct {
+	name string
+	// Spec that will be used to create the mount manager.  Note
+	// that we can't mount procfs without a kernel, so each spec
+	// MUST contain something other than procfs mounted at /proc.
+	spec specs.Spec
+	// Paths that are expected to exist in the resulting fs.
+	expectedPaths []string
+}
+
+func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
+	testCases := []*CreateMountTestcase{
+		&CreateMountTestcase{
 			// Only proc.
 			name: "only proc mount",
 			spec: specs.Spec{
@@ -270,7 +302,7 @@ func TestCreateMountNamespace(t *testing.T) {
 			// /dev, and /sys.
 			expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"},
 		},
-		{
+		&CreateMountTestcase{
 			// Mounts are nested inside each other.
 			name: "nested mounts",
 			spec: specs.Spec{
@@ -314,7 +346,7 @@ func TestCreateMountNamespace(t *testing.T) {
 			expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/qux",
 				"/foo/qux-quz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"},
 		},
-		{
+		&CreateMountTestcase{
 			name: "mount inside /dev",
 			spec: specs.Spec{
 				Root: &specs.Root{
@@ -357,40 +389,46 @@ func TestCreateMountNamespace(t *testing.T) {
 			},
 			expectedPaths: []string{"/proc", "/dev", "/dev/fd-foo", "/dev/foo", "/dev/bar", "/sys"},
 		},
-		{
-			name: "mounts inside mandatory mounts",
-			spec: specs.Spec{
-				Root: &specs.Root{
-					Path:     os.TempDir(),
-					Readonly: true,
+	}
+
+	vfsCase := &CreateMountTestcase{
+		name: "mounts inside mandatory mounts",
+		spec: specs.Spec{
+			Root: &specs.Root{
+				Path:     os.TempDir(),
+				Readonly: true,
+			},
+			Mounts: []specs.Mount{
+				{
+					Destination: "/proc",
+					Type:        "tmpfs",
 				},
-				Mounts: []specs.Mount{
-					{
-						Destination: "/proc",
-						Type:        "tmpfs",
-					},
-					// We don't include /sys, and /tmp in
-					// the spec, since they will be added
-					// automatically.
-					//
-					// Instead, add submounts inside these
-					// directories and make sure they are
-					// visible under the mandatory mounts.
-					{
-						Destination: "/sys/bar",
-						Type:        "tmpfs",
-					},
-					{
-						Destination: "/tmp/baz",
-						Type:        "tmpfs",
-					},
+				// TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports
+				//  MkDirAt in VFS2 (and remove the redundant append).
+				// {
+				//		Destination: "/sys/bar",
+				//		Type:        "tmpfs",
+				//	},
+				//
+				{
+					Destination: "/tmp/baz",
+					Type:        "tmpfs",
 				},
 			},
-			expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz"},
 		},
+		expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"},
 	}
 
-	for _, tc := range testCases {
+	if !vfs2 {
+		vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"})
+		vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar")
+	}
+	return append(testCases, vfsCase)
+}
+
+// Test that MountNamespace can be created with various specs.
+func TestCreateMountNamespace(t *testing.T) {
+	for _, tc := range createMountTestcases(false /* vfs2 */) {
 		t.Run(tc.name, func(t *testing.T) {
 			conf := testConfig()
 			ctx := contexttest.Context(t)
@@ -412,13 +450,59 @@ func TestCreateMountNamespace(t *testing.T) {
 			}
 
 			root := mns.Root()
-			defer root.DecRef()
+			defer root.DecRef(ctx)
 			for _, p := range tc.expectedPaths {
 				maxTraversals := uint(0)
 				if d, err := mns.FindInode(ctx, root, root, p, &maxTraversals); err != nil {
 					t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
 				} else {
-					d.DecRef()
+					d.DecRef(ctx)
+				}
+			}
+		})
+	}
+}
+
+// Test that MountNamespace can be created with various specs under VFS2.
+func TestCreateMountNamespaceVFS2(t *testing.T) {
+	for _, tc := range createMountTestcases(true /* vfs2 */) {
+		t.Run(tc.name, func(t *testing.T) {
+			spec := testSpec()
+			spec.Mounts = tc.spec.Mounts
+			spec.Root = tc.spec.Root
+
+			t.Logf("Using root: %q", spec.Root.Path)
+			l, loaderCleanup, err := createLoader(true /* VFS2 Enabled */, spec)
+			if err != nil {
+				t.Fatalf("failed to create loader: %v", err)
+			}
+			defer l.Destroy()
+			defer loaderCleanup()
+
+			mntr := newContainerMounter(l.root.spec, l.root.goferFDs, l.k, l.mountHints)
+			if err := mntr.processHints(l.root.conf, l.root.procArgs.Credentials); err != nil {
+				t.Fatalf("failed process hints: %v", err)
+			}
+
+			ctx := l.k.SupervisorContext()
+			mns, err := mntr.setupVFS2(ctx, l.root.conf, &l.root.procArgs)
+			if err != nil {
+				t.Fatalf("failed to setupVFS2: %v", err)
+			}
+
+			root := mns.Root()
+			defer root.DecRef(ctx)
+			for _, p := range tc.expectedPaths {
+				target := &vfs.PathOperation{
+					Root:  root,
+					Start: root,
+					Path:  fspath.Parse(p),
+				}
+
+				if d, err := l.k.VFS().GetDentryAt(ctx, l.root.procArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil {
+					t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
+				} else {
+					d.DecRef(ctx)
 				}
 			}
 		})
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index f98c5fd36..4e1fa7665 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -17,12 +17,16 @@ package boot
 import (
 	"fmt"
 	"net"
+	"runtime"
+	"strings"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
 	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.dev/gvisor/pkg/tcpip/link/packetsocket"
+	"gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo"
 	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
 	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
@@ -31,6 +35,32 @@ import (
 	"gvisor.dev/gvisor/pkg/urpc"
 )
 
+var (
+	// DefaultLoopbackLink contains the IP addresses "127.0.0.1" and "::1" and the
+	// routes "127.0.0.0/8" and "::1/128" on the "lo" interface.
+	DefaultLoopbackLink = LoopbackLink{
+		Name: "lo",
+		Addresses: []net.IP{
+			net.IP("\x7f\x00\x00\x01"),
+			net.IPv6loopback,
+		},
+		Routes: []Route{
+			{
+				Destination: net.IPNet{
+					IP:   net.IPv4(0x7f, 0, 0, 0),
+					Mask: net.IPv4Mask(0xff, 0, 0, 0),
+				},
+			},
+			{
+				Destination: net.IPNet{
+					IP:   net.IPv6loopback,
+					Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)),
+				},
+			},
+		},
+	}
+)
+
 // Network exposes methods that can be used to configure a network stack.
 type Network struct {
 	Stack *stack.Stack
@@ -48,6 +78,44 @@ type DefaultRoute struct {
 	Name  string
 }
 
+// QueueingDiscipline is used to specify the kind of Queueing Discipline to
+// apply for a given FDBasedLink.
+type QueueingDiscipline int
+
+const (
+	// QDiscNone disables any queueing for the underlying FD.
+	QDiscNone QueueingDiscipline = iota
+
+	// QDiscFIFO applies a simple fifo based queue to the underlying
+	// FD.
+	QDiscFIFO
+)
+
+// MakeQueueingDiscipline returns, if possible, the QueueingDiscipline
+// equivalent to s; otherwise it returns an error.
+func MakeQueueingDiscipline(s string) (QueueingDiscipline, error) {
+	switch s {
+	case "none":
+		return QDiscNone, nil
+	case "fifo":
+		return QDiscFIFO, nil
+	default:
+		return 0, fmt.Errorf("unsupported qdisc specified: %q", s)
+	}
+}
+
+// String implements fmt.Stringer.
+func (q QueueingDiscipline) String() string {
+	switch q {
+	case QDiscNone:
+		return "none"
+	case QDiscFIFO:
+		return "fifo"
+	default:
+		panic(fmt.Sprintf("Invalid queueing discipline: %d", q))
+	}
+}
+
 // FDBasedLink configures an fd-based link.
 type FDBasedLink struct {
 	Name               string
@@ -56,7 +124,10 @@ type FDBasedLink struct {
 	Routes             []Route
 	GSOMaxSize         uint32
 	SoftwareGSOEnabled bool
+	TXChecksumOffload  bool
+	RXChecksumOffload  bool
 	LinkAddress        net.HardwareAddr
+	QDisc              QueueingDiscipline
 
 	// NumChannels controls how many underlying FD's are to be used to
 	// create this endpoint.
@@ -80,7 +151,8 @@ type CreateLinksAndRoutesArgs struct {
 	LoopbackLinks []LoopbackLink
 	FDBasedLinks  []FDBasedLink
 
-	DefaultGateway DefaultRoute
+	Defaultv4Gateway DefaultRoute
+	Defaultv6Gateway DefaultRoute
 }
 
 // Empty returns true if route hasn't been set.
@@ -122,10 +194,10 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		nicID++
 		nicids[link.Name] = nicID
 
-		ep := loopback.New()
+		linkEP := loopback.New()
 
 		log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses)
-		if err := n.createNICWithAddrs(nicID, link.Name, ep, link.Addresses, true /* loopback */); err != nil {
+		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
 			return err
 		}
 
@@ -157,7 +229,9 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		}
 
 		mac := tcpip.LinkAddress(link.LinkAddress)
-		ep, err := fdbased.New(&fdbased.Options{
+		log.Infof("gso max size is: %d", link.GSOMaxSize)
+
+		linkEP, err := fdbased.New(&fdbased.Options{
 			FDs:                FDs,
 			MTU:                uint32(link.MTU),
 			EthernetHeader:     true,
@@ -165,14 +239,25 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 			PacketDispatchMode: fdbased.RecvMMsg,
 			GSOMaxSize:         link.GSOMaxSize,
 			SoftwareGSOEnabled: link.SoftwareGSOEnabled,
-			RXChecksumOffload:  true,
+			TXChecksumOffload:  link.TXChecksumOffload,
+			RXChecksumOffload:  link.RXChecksumOffload,
 		})
 		if err != nil {
 			return err
 		}
 
+		switch link.QDisc {
+		case QDiscNone:
+		case QDiscFIFO:
+			log.Infof("Enabling FIFO QDisc on %q", link.Name)
+			linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000)
+		}
+
+		// Enable support for AF_PACKET sockets to receive outgoing packets.
+		linkEP = packetsocket.New(linkEP)
+
 		log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
-		if err := n.createNICWithAddrs(nicID, link.Name, ep, link.Addresses, false /* loopback */); err != nil {
+		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
 			return err
 		}
 
@@ -186,12 +271,24 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		}
 	}
 
-	if !args.DefaultGateway.Route.Empty() {
-		nicID, ok := nicids[args.DefaultGateway.Name]
+	if !args.Defaultv4Gateway.Route.Empty() {
+		nicID, ok := nicids[args.Defaultv4Gateway.Name]
 		if !ok {
-			return fmt.Errorf("invalid interface name %q for default route", args.DefaultGateway.Name)
+			return fmt.Errorf("invalid interface name %q for default route", args.Defaultv4Gateway.Name)
 		}
-		route, err := args.DefaultGateway.Route.toTcpipRoute(nicID)
+		route, err := args.Defaultv4Gateway.Route.toTcpipRoute(nicID)
+		if err != nil {
+			return err
+		}
+		routes = append(routes, route)
+	}
+
+	if !args.Defaultv6Gateway.Route.Empty() {
+		nicID, ok := nicids[args.Defaultv6Gateway.Name]
+		if !ok {
+			return fmt.Errorf("invalid interface name %q for default route", args.Defaultv6Gateway.Name)
+		}
+		route, err := args.Defaultv6Gateway.Route.toTcpipRoute(nicID)
 		if err != nil {
 			return err
 		}
@@ -205,15 +302,10 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 
 // createNICWithAddrs creates a NIC in the network stack and adds the given
 // addresses.
-func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP, loopback bool) error {
-	if loopback {
-		if err := n.Stack.CreateNamedLoopbackNIC(id, name, sniffer.New(ep)); err != nil {
-			return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v) failed: %v", id, name, err)
-		}
-	} else {
-		if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(ep)); err != nil {
-			return fmt.Errorf("CreateNamedNIC(%v, %v) failed: %v", id, name, err)
-		}
+func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP) error {
+	opts := stack.NICOptions{Name: name}
+	if err := n.Stack.CreateNICWithOptions(id, sniffer.New(ep), opts); err != nil {
+		return fmt.Errorf("CreateNICWithOptions(%d, _, %+v) failed: %v", id, opts, err)
 	}
 
 	// Always start with an arp address for the NIC.
diff --git a/runsc/boot/platforms/BUILD b/runsc/boot/platforms/BUILD
index 03391cdca..77774f43c 100644
--- a/runsc/boot/platforms/BUILD
+++ b/runsc/boot/platforms/BUILD
@@ -1,11 +1,10 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools:defs.bzl", "go_library")
 
 package(licenses = ["notice"])
 
 go_library(
     name = "platforms",
     srcs = ["platforms.go"],
-    importpath = "gvisor.dev/gvisor/runsc/boot/platforms",
     visibility = [
         "//runsc:__subpackages__",
     ],
diff --git a/runsc/boot/pprof/BUILD b/runsc/boot/pprof/BUILD
new file mode 100644
index 000000000..29cb42b2f
--- /dev/null
+++ b/runsc/boot/pprof/BUILD
@@ -0,0 +1,11 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "pprof",
+    srcs = ["pprof.go"],
+    visibility = [
+        "//runsc:__subpackages__",
+    ],
+)
diff --git a/runsc/boot/pprof.go b/runsc/boot/pprof/pprof.go
index 463362f02..1ded20dee 100644
--- a/runsc/boot/pprof.go
+++ b/runsc/boot/pprof/pprof.go
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package boot
+// Package pprof provides a stub to initialize custom profilers.
+package pprof
 
-func initializePProf() {
+// Initialize will be called at boot for initializing custom profilers.
+func Initialize() {
 }
diff --git a/runsc/boot/user.go b/runsc/boot/user.go
deleted file mode 100644
index 56cc12ee0..000000000
--- a/runsc/boot/user.go
+++ /dev/null
@@ -1,170 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package boot
-
-import (
-	"bufio"
-	"fmt"
-	"io"
-	"strconv"
-	"strings"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
-)
-
-type fileReader struct {
-	// Ctx is the context for the file reader.
-	Ctx context.Context
-
-	// File is the file to read from.
-	File *fs.File
-}
-
-// Read implements io.Reader.Read.
-func (r *fileReader) Read(buf []byte) (int, error) {
-	n, err := r.File.Readv(r.Ctx, usermem.BytesIOSequence(buf))
-	return int(n), err
-}
-
-// getExecUserHome returns the home directory of the executing user read from
-// /etc/passwd as read from the container filesystem.
-func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.KUID) (string, error) {
-	// The default user home directory to return if no user matching the user
-	// if found in the /etc/passwd found in the image.
-	const defaultHome = "/"
-
-	// Open the /etc/passwd file from the dirent via the root mount namespace.
-	mnsRoot := rootMns.Root()
-	maxTraversals := uint(linux.MaxSymlinkTraversals)
-	dirent, err := rootMns.FindInode(ctx, mnsRoot, nil, "/etc/passwd", &maxTraversals)
-	if err != nil {
-		// NOTE: Ignore errors opening the passwd file. If the passwd file
-		// doesn't exist we will return the default home directory.
-		return defaultHome, nil
-	}
-	defer dirent.DecRef()
-
-	// Check read permissions on the file.
-	if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Read: true}); err != nil {
-		// NOTE: Ignore permissions errors here and return default root dir.
-		return defaultHome, nil
-	}
-
-	// Only open regular files. We don't open other files like named pipes as
-	// they may block and might present some attack surface to the container.
-	// Note that runc does not seem to do this kind of checking.
-	if !fs.IsRegular(dirent.Inode.StableAttr) {
-		return defaultHome, nil
-	}
-
-	f, err := dirent.Inode.GetFile(ctx, dirent, fs.FileFlags{Read: true, Directory: false})
-	if err != nil {
-		return "", err
-	}
-	defer f.DecRef()
-
-	r := &fileReader{
-		Ctx:  ctx,
-		File: f,
-	}
-
-	homeDir, err := findHomeInPasswd(uint32(uid), r, defaultHome)
-	if err != nil {
-		return "", err
-	}
-
-	return homeDir, nil
-}
-
-// maybeAddExecUserHome returns a new slice with the HOME enviroment variable
-// set if the slice does not already contain it, otherwise it returns the
-// original slice unmodified.
-func maybeAddExecUserHome(ctx context.Context, mns *fs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) {
-	// Check if the envv already contains HOME.
-	for _, env := range envv {
-		if strings.HasPrefix(env, "HOME=") {
-			// We have it. Return the original slice unmodified.
-			return envv, nil
-		}
-	}
-
-	// Read /etc/passwd for the user's HOME directory and set the HOME
-	// environment variable as required by POSIX if it is not overridden by
-	// the user.
-	homeDir, err := getExecUserHome(ctx, mns, uid)
-	if err != nil {
-		return nil, fmt.Errorf("error reading exec user: %v", err)
-	}
-	return append(envv, "HOME="+homeDir), nil
-}
-
-// findHomeInPasswd parses a passwd file and returns the given user's home
-// directory. This function does it's best to replicate the runc's behavior.
-func findHomeInPasswd(uid uint32, passwd io.Reader, defaultHome string) (string, error) {
-	s := bufio.NewScanner(passwd)
-
-	for s.Scan() {
-		if err := s.Err(); err != nil {
-			return "", err
-		}
-
-		line := strings.TrimSpace(s.Text())
-		if line == "" {
-			continue
-		}
-
-		// Pull out part of passwd entry. Loosely parse the passwd entry as some
-		// passwd files could be poorly written and for compatibility with runc.
-		//
-		// Per 'man 5 passwd'
-		// /etc/passwd contains one line for each user account, with seven
-		// fields delimited by colons (“:”). These fields are:
-		//
-		// - login name
-		// - optional encrypted password
-		// - numerical user ID
-		// - numerical group ID
-		// - user name or comment field
-		// - user home directory
-		// - optional user command interpreter
-		parts := strings.Split(line, ":")
-
-		found := false
-		homeDir := ""
-		for i, p := range parts {
-			switch i {
-			case 2:
-				parsedUID, err := strconv.ParseUint(p, 10, 32)
-				if err == nil && parsedUID == uint64(uid) {
-					found = true
-				}
-			case 5:
-				homeDir = p
-			}
-		}
-		if found {
-			// NOTE: If the uid is present but the home directory is not
-			// present in the /etc/passwd entry we return an empty string. This
-			// is, for better or worse, what runc does.
-			return homeDir, nil
-		}
-	}
-
-	return defaultHome, nil
-}
diff --git a/runsc/boot/user_test.go b/runsc/boot/user_test.go
deleted file mode 100644
index 9aee2ad07..000000000
--- a/runsc/boot/user_test.go
+++ /dev/null
@@ -1,254 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package boot
-
-import (
-	"io/ioutil"
-	"os"
-	"path/filepath"
-	"strings"
-	"syscall"
-	"testing"
-
-	specs "github.com/opencontainers/runtime-spec/specs-go"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-)
-
-func setupTempDir() (string, error) {
-	tmpDir, err := ioutil.TempDir(os.TempDir(), "exec-user-test")
-	if err != nil {
-		return "", err
-	}
-	return tmpDir, nil
-}
-
-func setupPasswd(contents string, perms os.FileMode) func() (string, error) {
-	return func() (string, error) {
-		tmpDir, err := setupTempDir()
-		if err != nil {
-			return "", err
-		}
-
-		if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil {
-			return "", err
-		}
-
-		f, err := os.Create(filepath.Join(tmpDir, "etc", "passwd"))
-		if err != nil {
-			return "", err
-		}
-		defer f.Close()
-
-		_, err = f.WriteString(contents)
-		if err != nil {
-			return "", err
-		}
-
-		err = f.Chmod(perms)
-		if err != nil {
-			return "", err
-		}
-		return tmpDir, nil
-	}
-}
-
-// TestGetExecUserHome tests the getExecUserHome function.
-func TestGetExecUserHome(t *testing.T) {
-	tests := map[string]struct {
-		uid        auth.KUID
-		createRoot func() (string, error)
-		expected   string
-	}{
-		"success": {
-			uid:        1000,
-			createRoot: setupPasswd("adin::1000:1111::/home/adin:/bin/sh", 0666),
-			expected:   "/home/adin",
-		},
-		"no_passwd": {
-			uid:        1000,
-			createRoot: setupTempDir,
-			expected:   "/",
-		},
-		"no_perms": {
-			uid:        1000,
-			createRoot: setupPasswd("adin::1000:1111::/home/adin:/bin/sh", 0000),
-			expected:   "/",
-		},
-		"directory": {
-			uid: 1000,
-			createRoot: func() (string, error) {
-				tmpDir, err := setupTempDir()
-				if err != nil {
-					return "", err
-				}
-
-				if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil {
-					return "", err
-				}
-
-				if err := syscall.Mkdir(filepath.Join(tmpDir, "etc", "passwd"), 0666); err != nil {
-					return "", err
-				}
-
-				return tmpDir, nil
-			},
-			expected: "/",
-		},
-		// Currently we don't allow named pipes.
-		"named_pipe": {
-			uid: 1000,
-			createRoot: func() (string, error) {
-				tmpDir, err := setupTempDir()
-				if err != nil {
-					return "", err
-				}
-
-				if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil {
-					return "", err
-				}
-
-				if err := syscall.Mkfifo(filepath.Join(tmpDir, "etc", "passwd"), 0666); err != nil {
-					return "", err
-				}
-
-				return tmpDir, nil
-			},
-			expected: "/",
-		},
-	}
-
-	for name, tc := range tests {
-		t.Run(name, func(t *testing.T) {
-			tmpDir, err := tc.createRoot()
-			if err != nil {
-				t.Fatalf("failed to create root dir: %v", err)
-			}
-
-			sandEnd, cleanup, err := startGofer(tmpDir)
-			if err != nil {
-				t.Fatalf("failed to create gofer: %v", err)
-			}
-			defer cleanup()
-
-			ctx := contexttest.Context(t)
-			conf := &Config{
-				RootDir:        "unused_root_dir",
-				Network:        NetworkNone,
-				DisableSeccomp: true,
-			}
-
-			spec := &specs.Spec{
-				Root: &specs.Root{
-					Path:     tmpDir,
-					Readonly: true,
-				},
-				// Add /proc mount as tmpfs to avoid needing a kernel.
-				Mounts: []specs.Mount{
-					{
-						Destination: "/proc",
-						Type:        "tmpfs",
-					},
-				},
-			}
-
-			mntr := newContainerMounter(spec, []int{sandEnd}, nil, &podMountHints{})
-			mns, err := mntr.createMountNamespace(ctx, conf)
-			if err != nil {
-				t.Fatalf("failed to create mount namespace: %v", err)
-			}
-			ctx = fs.WithRoot(ctx, mns.Root())
-			if err := mntr.mountSubmounts(ctx, conf, mns); err != nil {
-				t.Fatalf("failed to create mount namespace: %v", err)
-			}
-
-			got, err := getExecUserHome(ctx, mns, tc.uid)
-			if err != nil {
-				t.Fatalf("failed to get user home: %v", err)
-			}
-
-			if got != tc.expected {
-				t.Fatalf("expected %v, got: %v", tc.expected, got)
-			}
-		})
-	}
-}
-
-// TestFindHomeInPasswd tests the findHomeInPasswd function's passwd file parsing.
-func TestFindHomeInPasswd(t *testing.T) {
-	tests := map[string]struct {
-		uid      uint32
-		passwd   string
-		expected string
-		def      string
-	}{
-		"empty": {
-			uid:      1000,
-			passwd:   "",
-			expected: "/",
-			def:      "/",
-		},
-		"whitespace": {
-			uid:      1000,
-			passwd:   "       ",
-			expected: "/",
-			def:      "/",
-		},
-		"full": {
-			uid:      1000,
-			passwd:   "adin::1000:1111::/home/adin:/bin/sh",
-			expected: "/home/adin",
-			def:      "/",
-		},
-		// For better or worse, this is how runc works.
-		"partial": {
-			uid:      1000,
-			passwd:   "adin::1000:1111:",
-			expected: "",
-			def:      "/",
-		},
-		"multiple": {
-			uid:      1001,
-			passwd:   "adin::1000:1111::/home/adin:/bin/sh\nian::1001:1111::/home/ian:/bin/sh",
-			expected: "/home/ian",
-			def:      "/",
-		},
-		"duplicate": {
-			uid:      1000,
-			passwd:   "adin::1000:1111::/home/adin:/bin/sh\nian::1000:1111::/home/ian:/bin/sh",
-			expected: "/home/adin",
-			def:      "/",
-		},
-		"empty_lines": {
-			uid:      1001,
-			passwd:   "adin::1000:1111::/home/adin:/bin/sh\n\n\nian::1001:1111::/home/ian:/bin/sh",
-			expected: "/home/ian",
-			def:      "/",
-		},
-	}
-
-	for name, tc := range tests {
-		t.Run(name, func(t *testing.T) {
-			got, err := findHomeInPasswd(tc.uid, strings.NewReader(tc.passwd), tc.def)
-			if err != nil {
-				t.Fatalf("error parsing passwd: %v", err)
-			}
-			if tc.expected != got {
-				t.Fatalf("expected %v, got: %v", tc.expected, got)
-			}
-		})
-	}
-}
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
new file mode 100644
index 000000000..08dce8b6c
--- /dev/null
+++ b/runsc/boot/vfs.go
@@ -0,0 +1,519 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+	"path"
+	"sort"
+	"strings"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/devices/memdev"
+	"gvisor.dev/gvisor/pkg/sentry/devices/ttydev"
+	"gvisor.dev/gvisor/pkg/sentry/devices/tundev"
+	"gvisor.dev/gvisor/pkg/sentry/fs/user"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/fuse"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/overlay"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+func registerFilesystems(k *kernel.Kernel) error {
+	ctx := k.SupervisorContext()
+	creds := auth.NewRootCredentials(k.RootUserNamespace())
+	vfsObj := k.VFS()
+
+	vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserList: true,
+		// TODO(b/29356795): Users may mount this once the terminals are in a
+		//  usable state.
+		AllowUserMount: false,
+	})
+	vfsObj.MustRegisterFilesystemType(devtmpfs.Name, &devtmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserList: true,
+	})
+	vfsObj.MustRegisterFilesystemType(overlay.Name, &overlay.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(proc.Name, &proc.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(sys.Name, &sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+	vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+
+	// Register the device types whose files are created in devtmpfs below.
+	if err := memdev.Register(vfsObj); err != nil {
+		return fmt.Errorf("registering memdev: %w", err)
+	}
+	if err := ttydev.Register(vfsObj); err != nil {
+		return fmt.Errorf("registering ttydev: %w", err)
+	}
+	tunSupported := tundev.IsNetTunSupported(inet.StackFromContext(ctx))
+	if tunSupported {
+		if err := tundev.Register(vfsObj); err != nil {
+			return fmt.Errorf("registering tundev: %w", err)
+		}
+	}
+
+	if kernel.FUSEEnabled {
+		if err := fuse.Register(vfsObj); err != nil {
+			return fmt.Errorf("registering fusedev: %w", err)
+		}
+	}
+
+	a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name)
+	if err != nil {
+		return fmt.Errorf("creating devtmpfs accessor: %w", err)
+	}
+	defer a.Release(ctx)
+
+	if err := a.UserspaceInit(ctx); err != nil {
+		return fmt.Errorf("initializing userspace: %w", err)
+	}
+	if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil {
+		return fmt.Errorf("creating memdev devtmpfs files: %w", err)
+	}
+	if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil {
+		return fmt.Errorf("creating ttydev devtmpfs files: %w", err)
+	}
+	if tunSupported {
+		if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil {
+			return fmt.Errorf("creating tundev devtmpfs files: %w", err)
+		}
+	}
+
+	if kernel.FUSEEnabled {
+		if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil {
+			return fmt.Errorf("creating fusedev devtmpfs files: %w", err)
+		}
+	}
+
+	return nil
+}
+
+func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+	mns, err := mntr.setupVFS2(ctx, conf, procArgs)
+	if err != nil {
+		return fmt.Errorf("failed to setupFS: %w", err)
+	}
+	procArgs.MountNamespaceVFS2 = mns
+
+	// Resolve the executable path from working dir and environment.
+	resolved, err := user.ResolveExecutablePath(ctx, procArgs)
+	if err != nil {
+		return err
+	}
+	procArgs.Filename = resolved
+	return nil
+}
+
+func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
+	log.Infof("Configuring container's file system with VFS2")
+
+	// Create context with root credentials to mount the filesystem (the current
+	// user may not be privileged enough).
+	rootCreds := auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
+	rootProcArgs := *procArgs
+	rootProcArgs.WorkingDirectory = "/"
+	rootProcArgs.Credentials = rootCreds
+	rootProcArgs.Umask = 0022
+	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
+	rootCtx := rootProcArgs.NewContext(c.k)
+
+	mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds)
+	if err != nil {
+		return nil, fmt.Errorf("creating mount namespace: %w", err)
+	}
+	rootProcArgs.MountNamespaceVFS2 = mns
+
+	// Mount submounts.
+	if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil {
+		return nil, fmt.Errorf("mounting submounts vfs2: %w", err)
+	}
+	return mns, nil
+}
+
+func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
+	fd := c.fds.remove()
+	opts := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
+
+	if conf.OverlayfsStaleRead {
+		// We can't check for overlayfs here because sandbox is chroot'ed and gofer
+		// can only send mount options for specs.Mounts (specs.Root is missing
+		// Options field). So assume root is always on top of overlayfs.
+		opts = append(opts, "overlayfs_stale_read")
+	}
+
+	log.Infof("Mounting root over 9P, ioFD: %d", fd)
+	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{
+		Data: strings.Join(opts, ","),
+	})
+	if err != nil {
+		return nil, fmt.Errorf("setting up mount namespace: %w", err)
+	}
+	return mns, nil
+}
+
+func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
+	mounts, err := c.prepareMountsVFS2()
+	if err != nil {
+		return err
+	}
+
+	for i := range mounts {
+		submount := &mounts[i]
+		log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options)
+		if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() {
+			if err := c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint); err != nil {
+				return fmt.Errorf("mount shared mount %q to %q: %w", hint.name, submount.Destination, err)
+			}
+		} else {
+			if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil {
+				return fmt.Errorf("mount submount %q: %w", submount.Destination, err)
+			}
+		}
+	}
+
+	if err := c.mountTmpVFS2(ctx, conf, creds, mns); err != nil {
+		return fmt.Errorf(`mount submount "/tmp": %w`, err)
+	}
+	return nil
+}
+
+type mountAndFD struct {
+	specs.Mount
+	fd int
+}
+
+func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
+	// Associate bind mounts with their FDs before sorting since there is an
+	// undocumented assumption that FDs are dispensed in the order in which
+	// they are required by mounts.
+	var mounts []mountAndFD
+	for _, m := range c.mounts {
+		fd := -1
+		// Only bind mounts use host FDs; see
+		// containerMounter.getMountNameAndOptionsVFS2.
+		if m.Type == bind {
+			fd = c.fds.remove()
+		}
+		mounts = append(mounts, mountAndFD{
+			Mount: m,
+			fd:    fd,
+		})
+	}
+	if err := c.checkDispenser(); err != nil {
+		return nil, err
+	}
+
+	// Sort the mounts so that we don't place children before parents.
+	sort.Slice(mounts, func(i, j int) bool {
+		return len(mounts[i].Destination) < len(mounts[j].Destination)
+	})
+
+	return mounts, nil
+}
+
+func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error {
+	root := mns.Root()
+	defer root.DecRef(ctx)
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(submount.Destination),
+	}
+	fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount)
+	if err != nil {
+		return fmt.Errorf("mountOptions failed: %w", err)
+	}
+	if len(fsName) == 0 {
+		// Filesystem is not supported (e.g. cgroup), just skip it.
+		return nil
+	}
+
+	if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil {
+		return err
+	}
+	if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil {
+		return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
+	}
+	log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts.GetFilesystemOptions.Data)
+	return nil
+}
+
+// getMountNameAndOptionsVFS2 returns the filesystem name and mount options to
+// use for m. An empty name with a nil error means the mount type is unknown.
+func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) {
+	fsName := m.Type
+	var data []string
+
+	// Find filesystem name and FS specific data field.
+	switch m.Type {
+	case devpts.Name, devtmpfs.Name, proc.Name, sys.Name:
+		// Nothing to do.
+
+	case nonefs:
+		fsName = sys.Name
+
+	case tmpfs.Name:
+		var err error
+		data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
+		if err != nil {
+			return "", nil, err
+		}
+
+	case bind:
+		fsName = gofer.Name
+		if m.fd == 0 {
+			// Check that an FD was provided to fail fast. Technically FD=0 is valid,
+			// but unlikely to be correct in this context.
+			return "", nil, fmt.Errorf("9P mount requires a connection FD")
+		}
+		data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)
+
+	default:
+		log.Warningf("ignoring unknown filesystem type %q", m.Type)
+		return "", nil, nil
+	}
+
+	opts := &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			Data: strings.Join(data, ","),
+		},
+		InternalMount: true,
+	}
+
+	for _, o := range m.Options {
+		switch o {
+		case "rw":
+			opts.ReadOnly = false
+		case "ro":
+			opts.ReadOnly = true
+		case "noatime":
+			opts.Flags.NoATime = true
+		case "noexec":
+			opts.Flags.NoExec = true
+		default:
+			log.Warningf("ignoring unknown mount option %q", o)
+		}
+	}
+
+	if conf.Overlay {
+		// All writes go to upper, be paranoid and make lower readonly.
+		opts.ReadOnly = true
+	}
+	return fsName, opts, nil
+}
+
+func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error {
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(currentPath),
+	}
+	_, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{})
+	if err == nil {
+		log.Debugf("Mount point %q already exists", currentPath)
+		return nil
+	}
+	if err != syserror.ENOENT {
+		return fmt.Errorf("stat failed for %q during mount point creation: %w", currentPath, err)
+	}
+
+	// Recurse to ensure parent is created and then create the mount point.
+	if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil {
+		return err
+	}
+	log.Debugf("Creating dir %q for mount point", currentPath)
+	mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
+	if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil {
+		return fmt.Errorf("failed to create directory %q for mount: %w", currentPath, err)
+	}
+	return nil
+}
+
+// mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so.
+// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
+// the host /tmp, but this is a nice optimization, and fixes some apps that call
+// mknod in /tmp. It's unsafe to mount tmpfs if:
+//   1. /tmp is mounted explicitly: we should not override user's wish
+//   2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
+//
+// Note that when there are submounts inside of '/tmp', directories for the
+// mount points must be present, making '/tmp' not empty anymore.
+func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
+	for _, m := range c.mounts {
+		// m.Destination has been cleaned, so it's safe to use equality here.
+		if m.Destination == "/tmp" {
+			log.Debugf(`Explict "/tmp" mount found, skipping internal tmpfs, mount: %+v`, m)
+			return nil
+		}
+	}
+
+	root := mns.Root()
+	defer root.DecRef(ctx)
+	pop := vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse("/tmp"),
+	}
+	// TODO(gvisor.dev/issue/2782): Use O_PATH when available.
+	fd, err := c.k.VFS().OpenAt(ctx, creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY})
+	switch err {
+	case nil:
+		defer fd.DecRef(ctx)
+
+		err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
+			if dirent.Name != "." && dirent.Name != ".." {
+				return syserror.ENOTEMPTY
+			}
+			return nil
+		}))
+		switch err {
+		case nil:
+			log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
+		case syserror.ENOTEMPTY:
+			// If more than "." and ".." is found, skip internal tmpfs to prevent
+			// hiding existing files.
+			log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`)
+			return nil
+		default:
+			return err
+		}
+		fallthrough
+
+	case syserror.ENOENT:
+		// No '/tmp' found (or fallthrough from above). It's safe to mount internal
+		// tmpfs.
+		tmpMount := specs.Mount{
+			Type:        tmpfs.Name,
+			Destination: "/tmp",
+			// Sticky bit is added to prevent accidental deletion of files from
+			// another user. This is normally done for /tmp.
+			Options: []string{"mode=01777"},
+		}
+		return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
+
+	case syserror.ENOTDIR:
+		// Not a dir?! Let it be.
+		return nil
+
+	default:
+		return fmt.Errorf(`opening "/tmp" inside container: %w`, err)
+	}
+}
+
+// processHintsVFS2 processes annotations that contain hints about how volumes
+// should be mounted (e.g. a volume shared between containers). It must be
+// called for the root container only.
+func (c *containerMounter) processHintsVFS2(conf *Config, creds *auth.Credentials) error {
+	ctx := c.k.SupervisorContext()
+	for _, hint := range c.hints.mounts {
+		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
+		// common gofer to mount all shared volumes.
+		if hint.mount.Type != tmpfs.Name {
+			continue
+		}
+
+		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
+		mnt, err := c.mountSharedMasterVFS2(ctx, conf, hint, creds)
+		if err != nil {
+			return fmt.Errorf("mounting shared master %q: %w", hint.name, err)
+		}
+		hint.vfsMount = mnt
+	}
+	return nil
+}
+
+// mountSharedMasterVFS2 mounts the master of a volume that is shared among
+// containers in a pod.
+func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) {
+	// Map mount type to filesystem name, and parse out the options that we are
+	// capable of dealing with.
+	mntFD := &mountAndFD{Mount: hint.mount}
+	fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, mntFD)
+	if err != nil {
+		return nil, err
+	}
+	if len(fsName) == 0 {
+		return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
+	}
+	return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
+}
+
+// mountSharedSubmountVFS2 bind mounts to a previously mounted volume that is
+// shared among containers in the same pod.
+func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) error {
+	if err := source.checkCompatible(mount); err != nil {
+		return err
+	}
+
+	_, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount})
+	if err != nil {
+		return err
+	}
+	newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts)
+	if err != nil {
+		return err
+	}
+	defer newMnt.DecRef(ctx)
+
+	root := mns.Root()
+	defer root.DecRef(ctx)
+	if err := c.makeSyntheticMount(ctx, mount.Destination, root, creds); err != nil {
+		return err
+	}
+
+	target := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(mount.Destination),
+	}
+	if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
+		return err
+	}
+	log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
+	return nil
+}
author	Ian Lewis <ianmlewis@gmail.com>	2020-08-17 21:44:31 -0400
committer	Ian Lewis <ianmlewis@gmail.com>	2020-08-17 21:44:31 -0400
commit	ac324f646ee3cb7955b0b45a7453aeb9671cbdf1 (patch)
tree	0cbc5018e8807421d701d190dc20525726c7ca76 /runsc/boot
parent	352ae1022ce19de28fc72e034cc469872ad79d06 (diff)
parent	6d0c5803d557d453f15ac6f683697eeb46dab680 (diff)