59 files changed, 2717 insertions, 901 deletions
diff --git a/runsc/BUILD b/runsc/BUILD
index a2a465e1e..e5587421d 100644
--- a/runsc/BUILD
+++ b/runsc/BUILD
@@ -1,7 +1,7 @@
 package(licenses = ["notice"])  # Apache 2.0
 
 load("@io_bazel_rules_go//go:def.bzl", "go_binary")
-load("@bazel_tools//tools/build_defs/pkg:pkg.bzl", "pkg_deb", "pkg_tar")
+load("@rules_pkg//:pkg.bzl", "pkg_deb", "pkg_tar")
 
 go_binary(
     name = "runsc",
@@ -13,7 +13,7 @@ go_binary(
     visibility = [
         "//visibility:public",
     ],
-    x_defs = {"main.version": "{VERSION}"},
+    x_defs = {"main.version": "{STABLE_VERSION}"},
     deps = [
         "//pkg/log",
         "//pkg/refs",
@@ -46,7 +46,7 @@ go_binary(
     visibility = [
         "//visibility:public",
     ],
-    x_defs = {"main.version": "{VERSION}"},
+    x_defs = {"main.version": "{STABLE_VERSION}"},
     deps = [
         "//pkg/log",
         "//pkg/refs",
@@ -76,28 +76,38 @@ pkg_tar(
 
 genrule(
     name = "deb-version",
+    # Note that runsc must appear in the srcs parameter and not the tools
+    # parameter, otherwise it will not be stamped. This is reasonable, as tools
+    # may be encoded differently in the build graph (cached more aggressively
+    # because they are assumed to be hermetic).
+    srcs = [":runsc"],
     outs = ["version.txt"],
     cmd = "$(location :runsc) -version | grep 'runsc version' | sed 's/^[^0-9]*//' > $@",
     stamp = 1,
-    tools = [":runsc"],
 )
 
 pkg_deb(
     name = "runsc-debian",
     architecture = "amd64",
     data = ":debian-data",
+    # Note that the description_file will be flattened (all newlines removed),
+    # and therefore it is kept to a simple one-line description. The expected
+    # format for debian packages is "short summary\nLonger explanation of
+    # tool." and this is impossible with the flattening.
     description_file = "debian/description",
     homepage = "https://gvisor.dev/",
     maintainer = "The gVisor Authors <gvisor-dev@googlegroups.com>",
     package = "runsc",
     postinst = "debian/postinst.sh",
-    tags = [
-        # TODO(b/135475885): pkg_deb requires python2:
-        # https://github.com/bazelbuild/bazel/issues/8443
-        "manual",
-    ],
     version_file = ":version.txt",
     visibility = [
         "//visibility:public",
     ],
 )
+
+sh_test(
+    name = "version_test",
+    size = "small",
+    srcs = ["version_test.sh"],
+    data = [":runsc"],
+)
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 588bb8851..6226b63f8 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "compat.go",
         "compat_amd64.go",
+        "compat_arm64.go",
         "config.go",
         "controller.go",
         "debug.go",
@@ -15,6 +16,8 @@ go_library(
         "fs.go",
         "limits.go",
         "loader.go",
+        "loader_amd64.go",
+        "loader_arm64.go",
         "network.go",
         "pprof.go",
         "strace.go",
@@ -57,10 +60,11 @@ go_library(
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/platform",
         "//pkg/sentry/sighandling",
-        "//pkg/sentry/socket/epsocket",
         "//pkg/sentry/socket/hostinet",
         "//pkg/sentry/socket/netlink",
         "//pkg/sentry/socket/netlink/route",
+        "//pkg/sentry/socket/netlink/uevent",
+        "//pkg/sentry/socket/netstack",
         "//pkg/sentry/socket/unix",
         "//pkg/sentry/state",
         "//pkg/sentry/strace",
@@ -80,6 +84,7 @@ go_library(
         "//pkg/tcpip/network/ipv6",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/icmp",
+        "//pkg/tcpip/transport/raw",
         "//pkg/tcpip/transport/tcp",
         "//pkg/tcpip/transport/udp",
         "//pkg/urpc",
@@ -106,9 +111,9 @@ go_test(
         "//pkg/control/server",
         "//pkg/log",
         "//pkg/p9",
-        "//pkg/sentry/arch:registers_go_proto",
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/fs",
+        "//pkg/sentry/kernel/auth",
         "//pkg/unet",
         "//runsc/fsgofer",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
index 07e35ab10..352e710d2 100644
--- a/runsc/boot/compat.go
+++ b/runsc/boot/compat.go
@@ -21,10 +21,8 @@ import (
 	"syscall"
 
 	"github.com/golang/protobuf/proto"
-	"gvisor.dev/gvisor/pkg/abi"
 	"gvisor.dev/gvisor/pkg/eventchannel"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/arch"
 	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
 	ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
 	"gvisor.dev/gvisor/pkg/sentry/strace"
@@ -53,9 +51,9 @@ type compatEmitter struct {
 }
 
 func newCompatEmitter(logFD int) (*compatEmitter, error) {
-	nameMap, ok := strace.Lookup(abi.Linux, arch.AMD64)
+	nameMap, ok := getSyscallNameMap()
 	if !ok {
-		return nil, fmt.Errorf("amd64 Linux syscall table not found")
+		return nil, fmt.Errorf("Linux syscall table not found")
 	}
 
 	c := &compatEmitter{
@@ -86,16 +84,16 @@ func (c *compatEmitter) Emit(msg proto.Message) (bool, error) {
 }
 
 func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) {
-	regs := us.Registers.GetArch().(*rpb.Registers_Amd64).Amd64
+	regs := us.Registers
 
 	c.mu.Lock()
 	defer c.mu.Unlock()
 
-	sysnr := regs.OrigRax
+	sysnr := syscallNum(regs)
 	tr := c.trackers[sysnr]
 	if tr == nil {
 		switch sysnr {
-		case syscall.SYS_PRCTL, syscall.SYS_ARCH_PRCTL:
+		case syscall.SYS_PRCTL:
 			// args: cmd, ...
 			tr = newArgsTracker(0)
 
@@ -112,10 +110,14 @@ func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) {
 			tr = newArgsTracker(2)
 
 		default:
-			tr = &onceTracker{}
+			tr = newArchArgsTracker(sysnr)
+			if tr == nil {
+				tr = &onceTracker{}
+			}
 		}
 		c.trackers[sysnr] = tr
 	}
+
 	if tr.shouldReport(regs) {
 		c.sink.Infof("Unsupported syscall: %s, regs: %+v", c.nameMap.Name(uintptr(sysnr)), regs)
 		tr.onReported(regs)
@@ -139,10 +141,10 @@ func (c *compatEmitter) Close() error {
 // the syscall and arguments.
 type syscallTracker interface {
 	// shouldReport returns true is the syscall should be reported.
-	shouldReport(regs *rpb.AMD64Registers) bool
+	shouldReport(regs *rpb.Registers) bool
 
 	// onReported marks the syscall as reported.
-	onReported(regs *rpb.AMD64Registers)
+	onReported(regs *rpb.Registers)
 }
 
 // onceTracker reports only a single time, used for most syscalls.
@@ -150,10 +152,45 @@ type onceTracker struct {
 	reported bool
 }
 
-func (o *onceTracker) shouldReport(_ *rpb.AMD64Registers) bool {
+func (o *onceTracker) shouldReport(_ *rpb.Registers) bool {
 	return !o.reported
 }
 
-func (o *onceTracker) onReported(_ *rpb.AMD64Registers) {
+func (o *onceTracker) onReported(_ *rpb.Registers) {
 	o.reported = true
 }
+
+// argsTracker reports only once for each different combination of arguments.
+// It's used for generic syscalls like ioctl to report once per 'cmd'.
+type argsTracker struct {
+	// argsIdx is the syscall arguments to use as unique ID.
+	argsIdx  []int
+	reported map[string]struct{}
+	count    int
+}
+
+func newArgsTracker(argIdx ...int) *argsTracker {
+	return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})}
+}
+
+// key returns the command based on the syscall argument index.
+func (a *argsTracker) key(regs *rpb.Registers) string {
+	var rv string
+	for _, idx := range a.argsIdx {
+		rv += fmt.Sprintf("%d|", argVal(idx, regs))
+	}
+	return rv
+}
+
+func (a *argsTracker) shouldReport(regs *rpb.Registers) bool {
+	if a.count >= reportLimit {
+		return false
+	}
+	_, ok := a.reported[a.key(regs)]
+	return !ok
+}
+
+func (a *argsTracker) onReported(regs *rpb.Registers) {
+	a.count++
+	a.reported[a.key(regs)] = struct{}{}
+}
diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go
index 43cd0db94..42b0ca8b0 100644
--- a/runsc/boot/compat_amd64.go
+++ b/runsc/boot/compat_amd64.go
@@ -16,62 +16,81 @@ package boot
 
 import (
 	"fmt"
+	"syscall"
 
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+	"gvisor.dev/gvisor/pkg/sentry/strace"
 )
 
 // reportLimit is the max number of events that should be reported per tracker.
 const reportLimit = 100
 
-// argsTracker reports only once for each different combination of arguments.
-// It's used for generic syscalls like ioctl to report once per 'cmd'.
-type argsTracker struct {
-	// argsIdx is the syscall arguments to use as unique ID.
-	argsIdx  []int
-	reported map[string]struct{}
-	count    int
+// newRegs creates an empty Registers instance.
+func newRegs() *rpb.Registers {
+	return &rpb.Registers{
+		Arch: &rpb.Registers_Amd64{
+			Amd64: &rpb.AMD64Registers{},
+		},
+	}
 }
 
-func newArgsTracker(argIdx ...int) *argsTracker {
-	return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})}
-}
+func argVal(argIdx int, regs *rpb.Registers) uint32 {
+	amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
 
-// cmd returns the command based on the syscall argument index.
-func (a *argsTracker) key(regs *rpb.AMD64Registers) string {
-	var rv string
-	for _, idx := range a.argsIdx {
-		rv += fmt.Sprintf("%d|", argVal(idx, regs))
+	switch argIdx {
+	case 0:
+		return uint32(amd64Regs.Rdi)
+	case 1:
+		return uint32(amd64Regs.Rsi)
+	case 2:
+		return uint32(amd64Regs.Rdx)
+	case 3:
+		return uint32(amd64Regs.R10)
+	case 4:
+		return uint32(amd64Regs.R8)
+	case 5:
+		return uint32(amd64Regs.R9)
 	}
-	return rv
+	panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
 }
 
-func argVal(argIdx int, regs *rpb.AMD64Registers) uint32 {
+func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) {
+	amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
+
 	switch argIdx {
 	case 0:
-		return uint32(regs.Rdi)
+		amd64Regs.Rdi = argVal
 	case 1:
-		return uint32(regs.Rsi)
+		amd64Regs.Rsi = argVal
 	case 2:
-		return uint32(regs.Rdx)
+		amd64Regs.Rdx = argVal
 	case 3:
-		return uint32(regs.R10)
+		amd64Regs.R10 = argVal
 	case 4:
-		return uint32(regs.R8)
+		amd64Regs.R8 = argVal
 	case 5:
-		return uint32(regs.R9)
+		amd64Regs.R9 = argVal
+	default:
+		panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
 	}
-	panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
 }
 
-func (a *argsTracker) shouldReport(regs *rpb.AMD64Registers) bool {
-	if a.count >= reportLimit {
-		return false
-	}
-	_, ok := a.reported[a.key(regs)]
-	return !ok
+func getSyscallNameMap() (strace.SyscallMap, bool) {
+	return strace.Lookup(abi.Linux, arch.AMD64)
+}
+
+func syscallNum(regs *rpb.Registers) uint64 {
+	amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
+	return amd64Regs.OrigRax
 }
 
-func (a *argsTracker) onReported(regs *rpb.AMD64Registers) {
-	a.count++
-	a.reported[a.key(regs)] = struct{}{}
+func newArchArgsTracker(sysnr uint64) syscallTracker {
+	switch sysnr {
+	case syscall.SYS_ARCH_PRCTL:
+		// args: cmd, ...
+		return newArgsTracker(0)
+	}
+	return nil
 }
diff --git a/runsc/boot/compat_arm64.go b/runsc/boot/compat_arm64.go
new file mode 100644
index 000000000..f784cd237
--- /dev/null
+++ b/runsc/boot/compat_arm64.go
@@ -0,0 +1,91 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+	"gvisor.dev/gvisor/pkg/sentry/strace"
+)
+
+// reportLimit is the max number of events that should be reported per tracker.
+const reportLimit = 100
+
+// newRegs creates an empty Registers instance.
+func newRegs() *rpb.Registers {
+	return &rpb.Registers{
+		Arch: &rpb.Registers_Arm64{
+			Arm64: &rpb.ARM64Registers{},
+		},
+	}
+}
+
+func argVal(argIdx int, regs *rpb.Registers) uint32 {
+	arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+
+	switch argIdx {
+	case 0:
+		return uint32(arm64Regs.R0)
+	case 1:
+		return uint32(arm64Regs.R1)
+	case 2:
+		return uint32(arm64Regs.R2)
+	case 3:
+		return uint32(arm64Regs.R3)
+	case 4:
+		return uint32(arm64Regs.R4)
+	case 5:
+		return uint32(arm64Regs.R5)
+	}
+	panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+}
+
+func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) {
+	arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+
+	switch argIdx {
+	case 0:
+		arm64Regs.R0 = argVal
+	case 1:
+		arm64Regs.R1 = argVal
+	case 2:
+		arm64Regs.R2 = argVal
+	case 3:
+		arm64Regs.R3 = argVal
+	case 4:
+		arm64Regs.R4 = argVal
+	case 5:
+		arm64Regs.R5 = argVal
+	default:
+		panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+	}
+}
+
+func getSyscallNameMap() (strace.SyscallMap, bool) {
+	return strace.Lookup(abi.Linux, arch.ARM64)
+}
+
+func syscallNum(regs *rpb.Registers) uint64 {
+	arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+	return arm64Regs.R8
+}
+
+func newArchArgsTracker(sysnr uint64) syscallTracker {
+	// Currently, no arch-specific syscalls need to be handled here.
+	return nil
+}
diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go
index 388298d8d..839c5303b 100644
--- a/runsc/boot/compat_test.go
+++ b/runsc/boot/compat_test.go
@@ -16,8 +16,6 @@ package boot
 
 import (
 	"testing"
-
-	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
 )
 
 func TestOnceTracker(t *testing.T) {
@@ -35,31 +33,34 @@ func TestOnceTracker(t *testing.T) {
 
 func TestArgsTracker(t *testing.T) {
 	for _, tc := range []struct {
-		name string
-		idx  []int
-		rdi1 uint64
-		rdi2 uint64
-		rsi1 uint64
-		rsi2 uint64
-		want bool
+		name   string
+		idx    []int
+		arg1_1 uint64
+		arg1_2 uint64
+		arg2_1 uint64
+		arg2_2 uint64
+		want   bool
 	}{
-		{name: "same rdi", idx: []int{0}, rdi1: 123, rdi2: 123, want: false},
-		{name: "same rsi", idx: []int{1}, rsi1: 123, rsi2: 123, want: false},
-		{name: "diff rdi", idx: []int{0}, rdi1: 123, rdi2: 321, want: true},
-		{name: "diff rsi", idx: []int{1}, rsi1: 123, rsi2: 321, want: true},
-		{name: "cmd is uint32", idx: []int{0}, rsi1: 0xdead00000123, rsi2: 0xbeef00000123, want: false},
-		{name: "same 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 123, rdi2: 321, want: false},
-		{name: "diff 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 789, rdi2: 987, want: true},
+		{name: "same arg1", idx: []int{0}, arg1_1: 123, arg1_2: 123, want: false},
+		{name: "same arg2", idx: []int{1}, arg2_1: 123, arg2_2: 123, want: false},
+		{name: "diff arg1", idx: []int{0}, arg1_1: 123, arg1_2: 321, want: true},
+		{name: "diff arg2", idx: []int{1}, arg2_1: 123, arg2_2: 321, want: true},
+		{name: "cmd is uint32", idx: []int{0}, arg2_1: 0xdead00000123, arg2_2: 0xbeef00000123, want: false},
+		{name: "same 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 123, arg1_2: 321, want: false},
+		{name: "diff 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 789, arg1_2: 987, want: true},
 	} {
 		t.Run(tc.name, func(t *testing.T) {
 			c := newArgsTracker(tc.idx...)
-			regs := &rpb.AMD64Registers{Rdi: tc.rdi1, Rsi: tc.rsi1}
+			regs := newRegs()
+			setArgVal(0, tc.arg1_1, regs)
+			setArgVal(1, tc.arg2_1, regs)
 			if !c.shouldReport(regs) {
 				t.Error("first call to shouldReport, got: false, want: true")
 			}
 			c.onReported(regs)
 
-			regs.Rdi, regs.Rsi = tc.rdi2, tc.rsi2
+			setArgVal(0, tc.arg1_2, regs)
+			setArgVal(1, tc.arg2_2, regs)
 			if got := c.shouldReport(regs); tc.want != got {
 				t.Errorf("second call to shouldReport, got: %t, want: %t", got, tc.want)
 			}
@@ -70,7 +71,9 @@ func TestArgsTracker(t *testing.T) {
 func TestArgsTrackerLimit(t *testing.T) {
 	c := newArgsTracker(0, 1)
 	for i := 0; i < reportLimit; i++ {
-		regs := &rpb.AMD64Registers{Rdi: 123, Rsi: uint64(i)}
+		regs := newRegs()
+		setArgVal(0, 123, regs)
+		setArgVal(1, uint64(i), regs)
 		if !c.shouldReport(regs) {
 			t.Error("shouldReport before limit was reached, got: false, want: true")
 		}
@@ -78,7 +81,9 @@ func TestArgsTrackerLimit(t *testing.T) {
 	}
 
 	// Should hit the count limit now.
-	regs := &rpb.AMD64Registers{Rdi: 123, Rsi: 123456}
+	regs := newRegs()
+	setArgVal(0, 123, regs)
+	setArgVal(1, 123456, regs)
 	if c.shouldReport(regs) {
 		t.Error("shouldReport after limit was reached, got: true, want: false")
 	}
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 05b8f8761..a878bc2ce 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -167,6 +167,9 @@ type Config struct {
 	// Overlay is whether to wrap the root filesystem in an overlay.
 	Overlay bool
 
+	// FSGoferHostUDS enables the gofer to mount a host UDS.
+	FSGoferHostUDS bool
+
 	// Network indicates what type of network to use.
 	Network NetworkType
 
@@ -175,8 +178,11 @@ type Config struct {
 	// capabilities.
 	EnableRaw bool
 
-	// GSO indicates that generic segmentation offload is enabled.
-	GSO bool
+	// HardwareGSO indicates that hardware segmentation offload is enabled.
+	HardwareGSO bool
+
+	// SoftwareGSO indicates that software segmentation offload is enabled.
+	SoftwareGSO bool
 
 	// LogPackets indicates that all network packets should be logged.
 	LogPackets bool
@@ -211,12 +217,6 @@ type Config struct {
 	// RestoreFile is the path to the saved container image
 	RestoreFile string
 
-	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
-	// tests. It allows runsc to start the sandbox process as the current
-	// user, and without chrooting the sandbox process. This can be
-	// necessary in test environments that have limited capabilities.
-	TestOnlyAllowRunAsCurrentUserWithoutChroot bool
-
 	// NumNetworkChannels controls the number of AF_PACKET sockets that map
 	// to the same underlying network device. This allows netstack to better
 	// scale for high throughput use cases.
@@ -233,6 +233,29 @@ type Config struct {
 
 	// ReferenceLeakMode sets reference leak check mode
 	ReferenceLeakMode refs.LeakMode
+
+	// OverlayfsStaleRead causes cached FDs to reopen after a file is opened for
+	// write to work around an overlayfs limitation on kernels before 4.19.
+	OverlayfsStaleRead bool
+
+	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
+	// tests. It allows runsc to start the sandbox process as the current
+	// user, and without chrooting the sandbox process. This can be
+	// necessary in test environments that have limited capabilities.
+	TestOnlyAllowRunAsCurrentUserWithoutChroot bool
+
+	// TestOnlyTestNameEnv should only be used in tests. It looks up the
+	// test name in the container environment variables and adds it to the debug
+	// log file name. This is done to help identify the log with the test when
+	// multiple tests are run in parallel, since there is no way to pass
+	// parameters to the runtime from docker.
+	TestOnlyTestNameEnv string
+
+	// CPUNumFromQuota sets the number of CPUs to the available CPU quota,
+	// using the least integer value greater than or equal to the quota.
+	//
+	// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
+	CPUNumFromQuota bool
 }
 
 // ToFlags returns a slice of flags that correspond to the given Config.
@@ -246,6 +269,7 @@ func (c *Config) ToFlags() []string {
 		"--debug-log-format=" + c.DebugLogFormat,
 		"--file-access=" + c.FileAccess.String(),
 		"--overlay=" + strconv.FormatBool(c.Overlay),
+		"--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS),
 		"--network=" + c.Network.String(),
 		"--log-packets=" + strconv.FormatBool(c.LogPackets),
 		"--platform=" + c.Platform,
@@ -260,10 +284,19 @@ func (c *Config) ToFlags() []string {
 		"--rootless=" + strconv.FormatBool(c.Rootless),
 		"--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr),
 		"--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode),
+		"--gso=" + strconv.FormatBool(c.HardwareGSO),
+		"--software-gso=" + strconv.FormatBool(c.SoftwareGSO),
+		"--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
 	}
+	if c.CPUNumFromQuota {
+		f = append(f, "--cpu-num-from-quota")
+	}
+	// Only include these if set since they are never to be used by users.
 	if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
-		// Only include if set since it is never to be used by users.
-		f = append(f, "-TESTONLY-unsafe-nonroot=true")
+		f = append(f, "--TESTONLY-unsafe-nonroot=true")
+	}
+	if len(c.TestOnlyTestNameEnv) != 0 {
+		f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv)
 	}
 	return f
 }
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 72cbabd16..9c9e94864 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -18,7 +18,6 @@ import (
 	"errors"
 	"fmt"
 	"os"
-	"path"
 	"syscall"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
@@ -27,12 +26,13 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/socket/epsocket"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	"gvisor.dev/gvisor/pkg/sentry/state"
 	"gvisor.dev/gvisor/pkg/sentry/time"
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
+	"gvisor.dev/gvisor/runsc/specutils"
 )
 
 const (
@@ -51,7 +51,7 @@ const (
 	ContainerEvent = "containerManager.Event"
 
 	// ContainerExecuteAsync is the URPC endpoint for executing a command in a
-	// container..
+	// container.
 	ContainerExecuteAsync = "containerManager.ExecuteAsync"
 
 	// ContainerPause pauses the container.
@@ -142,7 +142,7 @@ func newController(fd int, l *Loader) (*controller, error) {
 	}
 	srv.Register(manager)
 
-	if eps, ok := l.k.NetworkStack().(*epsocket.Stack); ok {
+	if eps, ok := l.k.NetworkStack().(*netstack.Stack); ok {
 		net := &Network{
 			Stack: eps.Stack,
 		}
@@ -152,7 +152,9 @@ func newController(fd int, l *Loader) (*controller, error) {
 	srv.Register(&debug{})
 	srv.Register(&control.Logging{})
 	if l.conf.ProfileEnable {
-		srv.Register(&control.Profile{})
+		srv.Register(&control.Profile{
+			Kernel: l.k,
+		})
 	}
 
 	return &controller{
@@ -161,7 +163,7 @@ func newController(fd int, l *Loader) (*controller, error) {
 	}, nil
 }
 
-// containerManager manages sandboes containers.
+// containerManager manages sandbox containers.
 type containerManager struct {
 	// startChan is used to signal when the root container process should
 	// be started.
@@ -234,17 +236,13 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
 	if args.CID == "" {
 		return errors.New("start argument missing container ID")
 	}
-	// Prevent CIDs containing ".." from confusing the sentry when creating
-	// /containers/<cid> directory.
-	// TODO(b/129293409): Once we have multiple independent roots, this
-	// check won't be necessary.
-	if path.Clean(args.CID) != args.CID {
-		return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID)
-	}
 	if len(args.FilePayload.Files) < 4 {
 		return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer")
 	}
 
+	// All validation passed; log the spec for debugging.
+	specutils.LogSpec(args.Spec)
+
 	err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files)
 	if err != nil {
 		log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err)
@@ -355,7 +353,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	fs.SetRestoreEnvironment(*renv)
 
 	// Prepare to load from the state file.
-	if eps, ok := networkStack.(*epsocket.Stack); ok {
+	if eps, ok := networkStack.(*netstack.Stack); ok {
 		stack.StackFromEnv = eps.Stack // FIXME(b/36201077)
 	}
 	info, err := specFile.Stat()
@@ -384,7 +382,9 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	}
 
 	// Since we have a new kernel we also must make a new watchdog.
-	dog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction)
+	dogOpts := watchdog.DefaultOpts
+	dogOpts.TaskTimeoutAction = cm.l.conf.WatchdogAction
+	dog := watchdog.New(k, dogOpts)
 
 	// Change the loader fields to reflect the changes made when restoring.
 	cm.l.k = k
diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD
index f5509b6b7..3a9dcfc04 100644
--- a/runsc/boot/filter/BUILD
+++ b/runsc/boot/filter/BUILD
@@ -6,6 +6,8 @@ go_library(
     name = "filter",
     srcs = [
         "config.go",
+        "config_amd64.go",
+        "config_arm64.go",
         "extra_filters.go",
         "extra_filters_msan.go",
         "extra_filters_race.go",
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 7ca776b3a..4fb9adca6 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -26,10 +26,6 @@ import (
 
 // allowedSyscalls is the set of syscalls executed by the Sentry to the host OS.
 var allowedSyscalls = seccomp.SyscallRules{
-	syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
-		{seccomp.AllowValue(linux.ARCH_GET_FS)},
-		{seccomp.AllowValue(linux.ARCH_SET_FS)},
-	},
 	syscall.SYS_CLOCK_GETTIME: {},
 	syscall.SYS_CLONE: []seccomp.Rule{
 		{
@@ -42,8 +38,15 @@ var allowedSyscalls = seccomp.SyscallRules{
 					syscall.CLONE_THREAD),
 		},
 	},
-	syscall.SYS_CLOSE:         {},
-	syscall.SYS_DUP:           {},
+	syscall.SYS_CLOSE: {},
+	syscall.SYS_DUP:   {},
+	syscall.SYS_DUP3: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(0),
+		},
+	},
 	syscall.SYS_EPOLL_CREATE1: {},
 	syscall.SYS_EPOLL_CTL:     {},
 	syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
@@ -88,14 +91,24 @@ var allowedSyscalls = seccomp.SyscallRules{
 			seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
 			seccomp.AllowAny{},
 			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
 		},
 		{
 			seccomp.AllowAny{},
 			seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
 			seccomp.AllowAny{},
+		},
+		// Non-private variants are included for flipcall support. They are otherwise
+		// unnecessary, as the sentry will use only private futexes internally.
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(linux.FUTEX_WAIT),
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(linux.FUTEX_WAKE),
 			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
 		},
 	},
 	syscall.SYS_GETPID: {},
@@ -121,11 +134,6 @@ var allowedSyscalls = seccomp.SyscallRules{
 			seccomp.AllowValue(syscall.SOL_SOCKET),
 			seccomp.AllowValue(syscall.SO_SNDBUF),
 		},
-		{
-			seccomp.AllowAny{},
-			seccomp.AllowValue(syscall.SOL_SOCKET),
-			seccomp.AllowValue(syscall.SO_REUSEADDR),
-		},
 	},
 	syscall.SYS_GETTID:       {},
 	syscall.SYS_GETTIMEOFDAY: {},
@@ -232,6 +240,15 @@ var allowedSyscalls = seccomp.SyscallRules{
 			seccomp.AllowValue(0),
 		},
 	},
+	unix.SYS_SENDMMSG: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(syscall.MSG_DONTWAIT),
+			seccomp.AllowValue(0),
+		},
+	},
 	syscall.SYS_RESTART_SYSCALL: {},
 	syscall.SYS_RT_SIGACTION:    {},
 	syscall.SYS_RT_SIGPROCMASK:  {},
@@ -295,6 +312,26 @@ func hostInetFilters() seccomp.SyscallRules {
 		syscall.SYS_GETSOCKOPT: []seccomp.Rule{
 			{
 				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IP),
+				seccomp.AllowValue(syscall.IP_TOS),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IP),
+				seccomp.AllowValue(syscall.IP_RECVTOS),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_TCLASS),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+			},
+			{
+				seccomp.AllowAny{},
 				seccomp.AllowValue(syscall.SOL_IPV6),
 				seccomp.AllowValue(syscall.IPV6_V6ONLY),
 			},
@@ -396,6 +433,34 @@ func hostInetFilters() seccomp.SyscallRules {
 				seccomp.AllowAny{},
 				seccomp.AllowValue(4),
 			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IP),
+				seccomp.AllowValue(syscall.IP_TOS),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IP),
+				seccomp.AllowValue(syscall.IP_RECVTOS),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_TCLASS),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
+			{
+				seccomp.AllowAny{},
+				seccomp.AllowValue(syscall.SOL_IPV6),
+				seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+				seccomp.AllowAny{},
+				seccomp.AllowValue(4),
+			},
 		},
 		syscall.SYS_SHUTDOWN: []seccomp.Rule{
 			{
diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go
new file mode 100644
index 000000000..5335ff82c
--- /dev/null
+++ b/runsc/boot/filter/config_amd64.go
@@ -0,0 +1,31 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+func init() {
+	allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL],
+		seccomp.Rule{seccomp.AllowValue(linux.ARCH_GET_FS)},
+		seccomp.Rule{seccomp.AllowValue(linux.ARCH_SET_FS)},
+	)
+}
diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go
new file mode 100644
index 000000000..7fa9bbda3
--- /dev/null
+++ b/runsc/boot/filter/config_arm64.go
@@ -0,0 +1,21 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package filter
+
+// Reserved for future customization.
+func init() {
+}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 34c674840..421ccd255 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -16,7 +16,6 @@ package boot
 
 import (
 	"fmt"
-	"path"
 	"path/filepath"
 	"sort"
 	"strconv"
@@ -52,7 +51,7 @@ const (
 	rootDevice = "9pfs-/"
 
 	// MountPrefix is the annotation prefix for mount hints.
-	MountPrefix = "gvisor.dev/spec/mount"
+	MountPrefix = "dev.gvisor.spec.mount."
 
 	// Filesystems that runsc supports.
 	bind     = "bind"
@@ -64,6 +63,9 @@ const (
 	nonefs   = "none"
 )
 
+// tmpfs has some extra supported options that we must pass through.
+var tmpfsAllowedOptions = []string{"mode", "uid", "gid"}
+
 func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
 	// Upper layer uses the same flags as lower, but it must be read-write.
 	upperFlags := lowerFlags
@@ -172,27 +174,25 @@ func p9MountOptions(fd int, fa FileAccessType) []string {
 func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) {
 	var out []string
 	for _, o := range opts {
-		kv := strings.Split(o, "=")
-		switch len(kv) {
-		case 1:
-			if specutils.ContainsStr(allowedKeys, o) {
-				out = append(out, o)
-				continue
-			}
-			log.Warningf("ignoring unsupported key %q", kv)
-		case 2:
-			if specutils.ContainsStr(allowedKeys, kv[0]) {
-				out = append(out, o)
-				continue
-			}
-			log.Warningf("ignoring unsupported key %q", kv[0])
-		default:
-			return nil, fmt.Errorf("invalid option %q", o)
+		ok, err := parseMountOption(o, allowedKeys...)
+		if err != nil {
+			return nil, err
+		}
+		if ok {
+			out = append(out, o)
 		}
 	}
 	return out, nil
 }
 
+func parseMountOption(opt string, allowedKeys ...string) (bool, error) {
+	kv := strings.SplitN(opt, "=", 3)
+	if len(kv) > 2 {
+		return false, fmt.Errorf("invalid option %q", opt)
+	}
+	return specutils.ContainsStr(allowedKeys, kv[0]), nil
+}
+
 // mountDevice returns a device string based on the fs type and target
 // of the mount.
 func mountDevice(m specs.Mount) string {
@@ -207,6 +207,8 @@ func mountDevice(m specs.Mount) string {
 
 func mountFlags(opts []string) fs.MountSourceFlags {
 	mf := fs.MountSourceFlags{}
+	// Note: changes to supported options must be reflected in
+	// isSupportedMountFlag() as well.
 	for _, o := range opts {
 		switch o {
 		case "rw":
@@ -224,6 +226,18 @@ func mountFlags(opts []string) fs.MountSourceFlags {
 	return mf
 }
 
+func isSupportedMountFlag(fstype, opt string) bool {
+	switch opt {
+	case "rw", "ro", "noatime", "noexec":
+		return true
+	}
+	if fstype == tmpfs {
+		ok, err := parseMountOption(opt, tmpfsAllowedOptions...)
+		return ok && err == nil
+	}
+	return false
+}
+
 func mustFindFilesystem(name string) fs.Filesystem {
 	fs, ok := fs.FindFilesystem(name)
 	if !ok {
@@ -427,6 +441,46 @@ func (m *mountHint) isSupported() bool {
 	return m.mount.Type == tmpfs && m.share == pod
 }
 
+// checkCompatible verifies that shared mount is compatible with master.
+// For now enforce that all options are the same. Once bind mount is properly
+// supported, then we should ensure the master is less restrictive than the
+// container, e.g. master can be 'rw' while container mounts as 'ro'.
+func (m *mountHint) checkCompatible(mount specs.Mount) error {
+	// Remove options that don't affect the mount's behavior.
+	masterOpts := filterUnsupportedOptions(m.mount)
+	slaveOpts := filterUnsupportedOptions(mount)
+
+	if len(masterOpts) != len(slaveOpts) {
+		return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+	}
+
+	sort.Strings(masterOpts)
+	sort.Strings(slaveOpts)
+	for i, opt := range masterOpts {
+		if opt != slaveOpts[i] {
+			return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+		}
+	}
+	return nil
+}
+
+func (m *mountHint) fileAccessType() FileAccessType {
+	if m.share == container {
+		return FileAccessExclusive
+	}
+	return FileAccessShared
+}
+
+func filterUnsupportedOptions(mount specs.Mount) []string {
+	rv := make([]string, 0, len(mount.Options))
+	for _, o := range mount.Options {
+		if isSupportedMountFlag(mount.Type, o) {
+			rv = append(rv, o)
+		}
+	}
+	return rv
+}
+
 // podMountHints contains a collection of mountHints for the pod.
 type podMountHints struct {
 	mounts map[string]*mountHint
@@ -435,14 +489,15 @@ type podMountHints struct {
 func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
 	mnts := make(map[string]*mountHint)
 	for k, v := range spec.Annotations {
-		// Look for 'gvisor.dev/spec/mount' annotations and parse them.
+		// Look for 'dev.gvisor.spec.mount' annotations and parse them.
 		if strings.HasPrefix(k, MountPrefix) {
-			parts := strings.Split(k, "/")
-			if len(parts) != 5 {
+			// Remove the prefix and split the rest.
+			parts := strings.Split(k[len(MountPrefix):], ".")
+			if len(parts) != 2 {
 				return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v)
 			}
-			name := parts[3]
-			if len(name) == 0 || path.Clean(name) != name {
+			name := parts[0]
+			if len(name) == 0 {
 				return nil, fmt.Errorf("invalid mount name: %s", name)
 			}
 			mnt := mnts[name]
@@ -450,7 +505,7 @@ func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
 				mnt = &mountHint{name: name}
 				mnts[name] = mnt
 			}
-			if err := mnt.setField(parts[4], v); err != nil {
+			if err := mnt.setField(parts[1], v); err != nil {
 				return nil, err
 			}
 		}
@@ -520,6 +575,11 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
 func (c *containerMounter) processHints(conf *Config) error {
 	ctx := c.k.SupervisorContext()
 	for _, hint := range c.hints.mounts {
+		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
+		// common gofer to mount all shared volumes.
+		if hint.mount.Type != tmpfs {
+			continue
+		}
 		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
 		inode, err := c.mountSharedMaster(ctx, conf, hint)
 		if err != nil {
@@ -655,6 +715,14 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
 	log.Infof("Mounting root over 9P, ioFD: %d", fd)
 	p9FS := mustFindFilesystem("9p")
 	opts := p9MountOptions(fd, conf.FileAccess)
+
+	if conf.OverlayfsStaleRead {
+		// We can't check for overlayfs here because sandbox is chroot'ed and gofer
+		// can only send mount options for specs.Mounts (specs.Root is missing
+		// Options field). So assume root is always on top of overlayfs.
+		opts = append(opts, "overlayfs_stale_read")
+	}
+
 	rootInode, err := p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
 	if err != nil {
 		return nil, fmt.Errorf("creating root mount point: %v", err)
@@ -689,7 +757,6 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 		fsName     string
 		opts       []string
 		useOverlay bool
-		err        error
 	)
 
 	switch m.Type {
@@ -700,14 +767,16 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 	case tmpfs:
 		fsName = m.Type
 
-		// tmpfs has some extra supported options that we must pass through.
-		opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid")
+		var err error
+		opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...)
+		if err != nil {
+			return "", nil, false, err
+		}
 
 	case bind:
 		fd := c.fds.remove()
 		fsName = "9p"
-		// Non-root bind mounts are always shared.
-		opts = p9MountOptions(fd, FileAccessShared)
+		opts = p9MountOptions(fd, c.getMountAccessType(m))
 		// If configured, add overlay to all writable mounts.
 		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
 
@@ -717,7 +786,15 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 		// for now.
 		log.Warningf("ignoring unknown filesystem type %q", m.Type)
 	}
-	return fsName, opts, useOverlay, err
+	return fsName, opts, useOverlay, nil
+}
+
+func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType {
+	if hint := c.hints.findMount(mount); hint != nil {
+		return hint.fileAccessType()
+	}
+	// Non-root bind mounts are always shared if no hints were provided.
+	return FileAccessShared
 }
 
 // mountSubmount mounts volumes inside the container's root. Because mounts may
@@ -779,24 +856,15 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
 		return fmt.Errorf("mount %q error: %v", m.Destination, err)
 	}
 
-	log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type)
+	log.Infof("Mounted %q to %q type: %s, internal-options: %q", m.Source, m.Destination, m.Type, opts)
 	return nil
 }
 
 // mountSharedSubmount binds mount to a previously mounted volume that is shared
 // among containers in the same pod.
 func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount specs.Mount, source *mountHint) error {
-	// For now enforce that all options are the same. Once bind mount is properly
-	// supported, then we should ensure the master is less restrictive than the
-	// container, e.g. master can be 'rw' while container mounts as 'ro'.
-	if len(mount.Options) != len(source.mount.Options) {
-		return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options)
-	}
-	sort.Strings(mount.Options)
-	for i, opt := range mount.Options {
-		if opt != source.mount.Options[i] {
-			return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options)
-		}
+	if err := source.checkCompatible(mount); err != nil {
+		return err
 	}
 
 	maxTraversals := uint(0)
diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go
index 49ab34b33..912037075 100644
--- a/runsc/boot/fs_test.go
+++ b/runsc/boot/fs_test.go
@@ -15,7 +15,6 @@
 package boot
 
 import (
-	"path"
 	"reflect"
 	"strings"
 	"testing"
@@ -26,19 +25,19 @@ import (
 func TestPodMountHintsHappy(t *testing.T) {
 	spec := &specs.Spec{
 		Annotations: map[string]string{
-			path.Join(MountPrefix, "mount1", "source"): "foo",
-			path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
-			path.Join(MountPrefix, "mount1", "share"):  "pod",
+			MountPrefix + "mount1.source": "foo",
+			MountPrefix + "mount1.type":   "tmpfs",
+			MountPrefix + "mount1.share":  "pod",
 
-			path.Join(MountPrefix, "mount2", "source"):  "bar",
-			path.Join(MountPrefix, "mount2", "type"):    "bind",
-			path.Join(MountPrefix, "mount2", "share"):   "container",
-			path.Join(MountPrefix, "mount2", "options"): "rw,private",
+			MountPrefix + "mount2.source":  "bar",
+			MountPrefix + "mount2.type":    "bind",
+			MountPrefix + "mount2.share":   "container",
+			MountPrefix + "mount2.options": "rw,private",
 		},
 	}
 	podHints, err := newPodMountHints(spec)
 	if err != nil {
-		t.Errorf("newPodMountHints failed: %v", err)
+		t.Fatalf("newPodMountHints failed: %v", err)
 	}
 
 	// Check that fields were set correctly.
@@ -86,95 +85,95 @@ func TestPodMountHintsErrors(t *testing.T) {
 		{
 			name: "too short",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1"): "foo",
+				MountPrefix + "mount1": "foo",
 			},
 			error: "invalid mount annotation",
 		},
 		{
 			name: "no name",
 			annotations: map[string]string{
-				MountPrefix + "//source": "foo",
+				MountPrefix + ".source": "foo",
 			},
 			error: "invalid mount name",
 		},
 		{
 			name: "missing source",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "type"):  "tmpfs",
-				path.Join(MountPrefix, "mount1", "share"): "pod",
+				MountPrefix + "mount1.type":  "tmpfs",
+				MountPrefix + "mount1.share": "pod",
 			},
 			error: "source field",
 		},
 		{
 			name: "missing type",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "foo",
-				path.Join(MountPrefix, "mount1", "share"):  "pod",
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.share":  "pod",
 			},
 			error: "type field",
 		},
 		{
 			name: "missing share",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "foo",
-				path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "tmpfs",
 			},
 			error: "share field",
 		},
 		{
 			name: "invalid field name",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "invalid"): "foo",
+				MountPrefix + "mount1.invalid": "foo",
 			},
 			error: "invalid mount annotation",
 		},
 		{
 			name: "invalid source",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "",
-				path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
-				path.Join(MountPrefix, "mount1", "share"):  "pod",
+				MountPrefix + "mount1.source": "",
+				MountPrefix + "mount1.type":   "tmpfs",
+				MountPrefix + "mount1.share":  "pod",
 			},
 			error: "source cannot be empty",
 		},
 		{
 			name: "invalid type",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "foo",
-				path.Join(MountPrefix, "mount1", "type"):   "invalid-type",
-				path.Join(MountPrefix, "mount1", "share"):  "pod",
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "invalid-type",
+				MountPrefix + "mount1.share":  "pod",
 			},
 			error: "invalid type",
 		},
 		{
 			name: "invalid share",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "foo",
-				path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
-				path.Join(MountPrefix, "mount1", "share"):  "invalid-share",
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "tmpfs",
+				MountPrefix + "mount1.share":  "invalid-share",
 			},
 			error: "invalid share",
 		},
 		{
 			name: "invalid options",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"):  "foo",
-				path.Join(MountPrefix, "mount1", "type"):    "tmpfs",
-				path.Join(MountPrefix, "mount1", "share"):   "pod",
-				path.Join(MountPrefix, "mount1", "options"): "invalid-option",
+				MountPrefix + "mount1.source":  "foo",
+				MountPrefix + "mount1.type":    "tmpfs",
+				MountPrefix + "mount1.share":   "pod",
+				MountPrefix + "mount1.options": "invalid-option",
 			},
 			error: "unknown mount option",
 		},
 		{
 			name: "duplicate source",
 			annotations: map[string]string{
-				path.Join(MountPrefix, "mount1", "source"): "foo",
-				path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
-				path.Join(MountPrefix, "mount1", "share"):  "pod",
+				MountPrefix + "mount1.source": "foo",
+				MountPrefix + "mount1.type":   "tmpfs",
+				MountPrefix + "mount1.share":  "pod",
 
-				path.Join(MountPrefix, "mount2", "source"): "foo",
-				path.Join(MountPrefix, "mount2", "type"):   "bind",
-				path.Join(MountPrefix, "mount2", "share"):  "container",
+				MountPrefix + "mount2.source": "foo",
+				MountPrefix + "mount2.type":   "bind",
+				MountPrefix + "mount2.share":  "container",
 			},
 			error: "have the same mount source",
 		},
@@ -191,3 +190,61 @@ func TestPodMountHintsErrors(t *testing.T) {
 		})
 	}
 }
+
+func TestGetMountAccessType(t *testing.T) {
+	const source = "foo"
+	for _, tst := range []struct {
+		name        string
+		annotations map[string]string
+		want        FileAccessType
+	}{
+		{
+			name: "container=exclusive",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": source,
+				MountPrefix + "mount1.type":   "bind",
+				MountPrefix + "mount1.share":  "container",
+			},
+			want: FileAccessExclusive,
+		},
+		{
+			name: "pod=shared",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": source,
+				MountPrefix + "mount1.type":   "bind",
+				MountPrefix + "mount1.share":  "pod",
+			},
+			want: FileAccessShared,
+		},
+		{
+			name: "shared=shared",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": source,
+				MountPrefix + "mount1.type":   "bind",
+				MountPrefix + "mount1.share":  "shared",
+			},
+			want: FileAccessShared,
+		},
+		{
+			name: "default=shared",
+			annotations: map[string]string{
+				MountPrefix + "mount1.source": source + "mismatch",
+				MountPrefix + "mount1.type":   "bind",
+				MountPrefix + "mount1.share":  "container",
+			},
+			want: FileAccessShared,
+		},
+	} {
+		t.Run(tst.name, func(t *testing.T) {
+			spec := &specs.Spec{Annotations: tst.annotations}
+			podHints, err := newPodMountHints(spec)
+			if err != nil {
+				t.Fatalf("newPodMountHints failed: %v", err)
+			}
+			mounter := containerMounter{hints: podHints}
+			if got := mounter.getMountAccessType(specs.Mount{Source: source}); got != tst.want {
+				t.Errorf("getMountAccessType(), want: %v, got: %v", tst.want, got)
+			}
+		})
+	}
+}
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 823a34619..bc1d0c1bb 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -20,7 +20,6 @@ import (
 	mrand "math/rand"
 	"os"
 	"runtime"
-	"strings"
 	"sync"
 	"sync/atomic"
 	"syscall"
@@ -44,7 +43,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/sighandling"
-	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
 	"gvisor.dev/gvisor/pkg/sentry/time"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
@@ -55,6 +53,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/raw"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
 	"gvisor.dev/gvisor/runsc/boot/filter"
@@ -62,10 +61,11 @@ import (
 	"gvisor.dev/gvisor/runsc/specutils"
 
 	// Include supported socket providers.
-	"gvisor.dev/gvisor/pkg/sentry/socket/epsocket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
 	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink"
 	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route"
+	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
 )
 
@@ -93,10 +93,6 @@ type Loader struct {
 	// spec is the base configuration for the root container.
 	spec *specs.Spec
 
-	// startSignalForwarding enables forwarding of signals to the sandboxed
-	// container. It should be called after the init process is loaded.
-	startSignalForwarding func() func()
-
 	// stopSignalForwarding disables forwarding of signals to the sandboxed
 	// container. It should be called when a sandbox is destroyed.
 	stopSignalForwarding func()
@@ -146,9 +142,6 @@ type execProcess struct {
 func init() {
 	// Initialize the random number generator.
 	mrand.Seed(gtime.Now().UnixNano())
-
-	// Register the global syscall table.
-	kernel.RegisterSyscallTable(slinux.AMD64)
 }
 
 // Args are the arguments for New().
@@ -232,7 +225,7 @@ func New(args Args) (*Loader, error) {
 	// this point. Netns is configured before Run() is called. Netstack is
 	// configured using a control uRPC message. Host network is configured inside
 	// Run().
-	networkStack, err := newEmptyNetworkStack(args.Conf, k)
+	networkStack, err := newEmptyNetworkStack(args.Conf, k, k)
 	if err != nil {
 		return nil, fmt.Errorf("creating network: %v", err)
 	}
@@ -300,7 +293,9 @@ func New(args Args) (*Loader, error) {
 	}
 
 	// Create a watchdog.
-	dog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction)
+	dogOpts := watchdog.DefaultOpts
+	dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
+	dog := watchdog.New(k, dogOpts)
 
 	procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
 	if err != nil {
@@ -337,29 +332,6 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("ignore child stop signals failed: %v", err)
 	}
 
-	// Handle signals by forwarding them to the root container process
-	// (except for panic signal, which should cause a panic).
-	l.startSignalForwarding = sighandling.PrepareHandler(func(sig linux.Signal) {
-		// Panic signal should cause a panic.
-		if args.Conf.PanicSignal != -1 && sig == linux.Signal(args.Conf.PanicSignal) {
-			panic("Signal-induced panic")
-		}
-
-		// Otherwise forward to root container.
-		deliveryMode := DeliverToProcess
-		if args.Console {
-			// Since we are running with a console, we should
-			// forward the signal to the foreground process group
-			// so that job control signals like ^C can be handled
-			// properly.
-			deliveryMode = DeliverToForegroundProcessGroup
-		}
-		log.Infof("Received external signal %d, mode: %v", sig, deliveryMode)
-		if err := l.signal(args.ID, 0, int32(sig), deliveryMode); err != nil {
-			log.Warningf("error sending signal %v to container %q: %v", sig, args.ID, err)
-		}
-	})
-
 	// Create the control server using the provided FD.
 	//
 	// This must be done *after* we have initialized the kernel since the
@@ -535,23 +507,12 @@ func (l *Loader) run() error {
 			return err
 		}
 
-		// Read /etc/passwd for the user's HOME directory and set the HOME
-		// environment variable as required by POSIX if it is not overridden by
-		// the user.
-		hasHomeEnvv := false
-		for _, envv := range l.rootProcArgs.Envv {
-			if strings.HasPrefix(envv, "HOME=") {
-				hasHomeEnvv = true
-			}
-		}
-		if !hasHomeEnvv {
-			homeDir, err := getExecUserHome(ctx, l.rootProcArgs.MountNamespace, uint32(l.rootProcArgs.Credentials.RealKUID))
-			if err != nil {
-				return fmt.Errorf("error reading exec user: %v", err)
-			}
-
-			l.rootProcArgs.Envv = append(l.rootProcArgs.Envv, "HOME="+homeDir)
+		// Add the HOME environment variable if it is not already set.
+		envv, err := maybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
+		if err != nil {
+			return err
 		}
+		l.rootProcArgs.Envv = envv
 
 		// Create the root container init task. It will begin running
 		// when the kernel is started.
@@ -578,8 +539,27 @@ func (l *Loader) run() error {
 		ep.tty.InitForegroundProcessGroup(ep.tg.ProcessGroup())
 	}
 
-	// Start signal forwarding only after an init process is created.
-	l.stopSignalForwarding = l.startSignalForwarding()
+	// Handle signals by forwarding them to the root container process
+	// (except for panic signal, which should cause a panic).
+	l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
+		// Panic signal should cause a panic.
+		if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) {
+			panic("Signal-induced panic")
+		}
+
+		// Otherwise forward to root container.
+		deliveryMode := DeliverToProcess
+		if l.console {
+			// Since we are running with a console, we should forward the signal to
+			// the foreground process group so that job control signals like ^C can
+			// be handled properly.
+			deliveryMode = DeliverToForegroundProcessGroup
+		}
+		log.Infof("Received external signal %d, mode: %v", sig, deliveryMode)
+		if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil {
+			log.Warningf("error sending signal %v to container %q: %v", sig, l.sandboxID, err)
+		}
+	})
 
 	log.Infof("Process should have started...")
 	l.watchdog.Start()
@@ -815,6 +795,16 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 	})
 	defer args.MountNamespace.DecRef()
 
+	// Add the HOME environment variable if it is not already set.
+	root := args.MountNamespace.Root()
+	defer root.DecRef()
+	ctx := fs.WithRoot(l.k.SupervisorContext(), root)
+	envv, err := maybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
+	if err != nil {
+		return 0, err
+	}
+	args.Envv = envv
+
 	// Start the process.
 	proc := control.Proc{Kernel: l.k}
 	args.PIDNamespace = tg.PIDNamespace()
@@ -906,22 +896,25 @@ func (l *Loader) WaitExit() kernel.ExitStatus {
 	return l.k.GlobalInit().ExitStatus()
 }
 
-func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
+func newEmptyNetworkStack(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
 	switch conf.Network {
 	case NetworkHost:
 		return hostinet.NewStack(), nil
 
 	case NetworkNone, NetworkSandbox:
 		// NetworkNone sets up loopback using netstack.
-		netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName}
-		protoNames := []string{tcp.ProtocolName, udp.ProtocolName, icmp.ProtocolName4}
-		s := epsocket.Stack{stack.New(netProtos, protoNames, stack.Options{
-			Clock:       clock,
-			Stats:       epsocket.Metrics,
-			HandleLocal: true,
+		netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
+		transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
+		s := netstack.Stack{stack.New(stack.Options{
+			NetworkProtocols:   netProtos,
+			TransportProtocols: transProtos,
+			Clock:              clock,
+			Stats:              netstack.Metrics,
+			HandleLocal:        true,
 			// Enable raw sockets for users with sufficient
 			// privileges.
-			Raw: true,
+			RawFactory: raw.EndpointFactory{},
+			UniqueID:   uniqueID,
 		})}
 
 		// Enable SACK Recovery.
@@ -929,6 +922,10 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
 			return nil, fmt.Errorf("failed to enable SACK: %v", err)
 		}
 
+		// Set default TTLs as required by socket/netstack.
+		s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+		s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+
 		// Enable Receive Buffer Auto-Tuning.
 		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
 			return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
diff --git a/runsc/boot/loader_amd64.go b/runsc/boot/loader_amd64.go
new file mode 100644
index 000000000..b9669f2ac
--- /dev/null
+++ b/runsc/boot/loader_amd64.go
@@ -0,0 +1,27 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package boot
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+)
+
+func init() {
+	// Register the global syscall table.
+	kernel.RegisterSyscallTable(linux.AMD64)
+}
diff --git a/runsc/boot/loader_arm64.go b/runsc/boot/loader_arm64.go
new file mode 100644
index 000000000..cf64d28c8
--- /dev/null
+++ b/runsc/boot/loader_arm64.go
@@ -0,0 +1,27 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package boot
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+)
+
+func init() {
+	// Register the global syscall table.
+	kernel.RegisterSyscallTable(linux.ARM64)
+}
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index ea0d9f790..dd4926bb9 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -50,12 +50,13 @@ type DefaultRoute struct {
 
 // FDBasedLink configures an fd-based link.
 type FDBasedLink struct {
-	Name        string
-	MTU         int
-	Addresses   []net.IP
-	Routes      []Route
-	GSOMaxSize  uint32
-	LinkAddress net.HardwareAddr
+	Name               string
+	MTU                int
+	Addresses          []net.IP
+	Routes             []Route
+	GSOMaxSize         uint32
+	SoftwareGSOEnabled bool
+	LinkAddress        net.HardwareAddr
 
 	// NumChannels controls how many underlying FD's are to be used to
 	// create this endpoint.
@@ -79,7 +80,8 @@ type CreateLinksAndRoutesArgs struct {
 	LoopbackLinks []LoopbackLink
 	FDBasedLinks  []FDBasedLink
 
-	DefaultGateway DefaultRoute
+	Defaultv4Gateway DefaultRoute
+	Defaultv6Gateway DefaultRoute
 }
 
 // Empty returns true if route hasn't been set.
@@ -163,6 +165,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 			Address:            mac,
 			PacketDispatchMode: fdbased.RecvMMsg,
 			GSOMaxSize:         link.GSOMaxSize,
+			SoftwareGSOEnabled: link.SoftwareGSOEnabled,
 			RXChecksumOffload:  true,
 		})
 		if err != nil {
@@ -184,12 +187,24 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		}
 	}
 
-	if !args.DefaultGateway.Route.Empty() {
-		nicID, ok := nicids[args.DefaultGateway.Name]
+	if !args.Defaultv4Gateway.Route.Empty() {
+		nicID, ok := nicids[args.Defaultv4Gateway.Name]
 		if !ok {
-			return fmt.Errorf("invalid interface name %q for default route", args.DefaultGateway.Name)
+			return fmt.Errorf("invalid interface name %q for default route", args.Defaultv4Gateway.Name)
 		}
-		route, err := args.DefaultGateway.Route.toTcpipRoute(nicID)
+		route, err := args.Defaultv4Gateway.Route.toTcpipRoute(nicID)
+		if err != nil {
+			return err
+		}
+		routes = append(routes, route)
+	}
+
+	if !args.Defaultv6Gateway.Route.Empty() {
+		nicID, ok := nicids[args.Defaultv6Gateway.Name]
+		if !ok {
+			return fmt.Errorf("invalid interface name %q for default route", args.Defaultv6Gateway.Name)
+		}
+		route, err := args.Defaultv6Gateway.Route.toTcpipRoute(nicID)
 		if err != nil {
 			return err
 		}
@@ -203,14 +218,14 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 
 // createNICWithAddrs creates a NIC in the network stack and adds the given
 // addresses.
-func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, addrs []net.IP, loopback bool) error {
+func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP, loopback bool) error {
 	if loopback {
-		if err := n.Stack.CreateNamedLoopbackNIC(id, name, sniffer.New(linkEP)); err != nil {
-			return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v, %v) failed: %v", id, name, linkEP, err)
+		if err := n.Stack.CreateNamedLoopbackNIC(id, name, sniffer.New(ep)); err != nil {
+			return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v, %v) failed: %v", id, name, ep, err)
 		}
 	} else {
-		if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(linkEP)); err != nil {
-			return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, linkEP, err)
+		if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(ep)); err != nil {
+			return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, ep, err)
 		}
 	}
 
diff --git a/runsc/boot/user.go b/runsc/boot/user.go
index d1d423a5c..56cc12ee0 100644
--- a/runsc/boot/user.go
+++ b/runsc/boot/user.go
@@ -16,6 +16,7 @@ package boot
 
 import (
 	"bufio"
+	"fmt"
 	"io"
 	"strconv"
 	"strings"
@@ -23,6 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 )
 
@@ -42,7 +44,7 @@ func (r *fileReader) Read(buf []byte) (int, error) {
 
 // getExecUserHome returns the home directory of the executing user read from
 // /etc/passwd as read from the container filesystem.
-func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid uint32) (string, error) {
+func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.KUID) (string, error) {
 	// The default user home directory to return if no user matching the user
 	// if found in the /etc/passwd found in the image.
 	const defaultHome = "/"
@@ -82,7 +84,7 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid uint32
 		File: f,
 	}
 
-	homeDir, err := findHomeInPasswd(uid, r, defaultHome)
+	homeDir, err := findHomeInPasswd(uint32(uid), r, defaultHome)
 	if err != nil {
 		return "", err
 	}
@@ -90,6 +92,28 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid uint32
 	return homeDir, nil
 }
 
+// maybeAddExecUserHome returns a new slice with the HOME environment variable
+// set if the slice does not already contain it, otherwise it returns the
+// original slice unmodified.
+func maybeAddExecUserHome(ctx context.Context, mns *fs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) {
+	// Check if the envv already contains HOME.
+	for _, env := range envv {
+		if strings.HasPrefix(env, "HOME=") {
+			// We have it. Return the original slice unmodified.
+			return envv, nil
+		}
+	}
+
+	// Read /etc/passwd for the user's HOME directory and set the HOME
+	// environment variable as required by POSIX if it is not overridden by
+	// the user.
+	homeDir, err := getExecUserHome(ctx, mns, uid)
+	if err != nil {
+		return nil, fmt.Errorf("error reading exec user: %v", err)
+	}
+	return append(envv, "HOME="+homeDir), nil
+}
+
 // findHomeInPasswd parses a passwd file and returns the given user's home
 // directory. This function does it's best to replicate the runc's behavior.
 func findHomeInPasswd(uid uint32, passwd io.Reader, defaultHome string) (string, error) {
diff --git a/runsc/boot/user_test.go b/runsc/boot/user_test.go
index 906baf3e5..9aee2ad07 100644
--- a/runsc/boot/user_test.go
+++ b/runsc/boot/user_test.go
@@ -25,6 +25,7 @@ import (
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 )
 
 func setupTempDir() (string, error) {
@@ -68,7 +69,7 @@ func setupPasswd(contents string, perms os.FileMode) func() (string, error) {
 // TestGetExecUserHome tests the getExecUserHome function.
 func TestGetExecUserHome(t *testing.T) {
 	tests := map[string]struct {
-		uid        uint32
+		uid        auth.KUID
 		createRoot func() (string, error)
 		expected   string
 	}{
diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go
index ab3a25b9b..653ca5f52 100644
--- a/runsc/cgroup/cgroup.go
+++ b/runsc/cgroup/cgroup.go
@@ -101,6 +101,14 @@ func getValue(path, name string) (string, error) {
 	return string(out), nil
 }
 
+func getInt(path, name string) (int, error) {
+	s, err := getValue(path, name)
+	if err != nil {
+		return 0, err
+	}
+	return strconv.Atoi(strings.TrimSpace(s))
+}
+
 // fillFromAncestor sets the value of a cgroup file from the first ancestor
 // that has content. It does nothing if the file in 'path' has already been set.
 func fillFromAncestor(path string) (string, error) {
@@ -323,6 +331,22 @@ func (c *Cgroup) Join() (func(), error) {
 	return undo, nil
 }
 
+func (c *Cgroup) CPUQuota() (float64, error) {
+	path := c.makePath("cpu")
+	quota, err := getInt(path, "cpu.cfs_quota_us")
+	if err != nil {
+		return -1, err
+	}
+	period, err := getInt(path, "cpu.cfs_period_us")
+	if err != nil {
+		return -1, err
+	}
+	if quota <= 0 || period <= 0 {
+		return -1, err
+	}
+	return float64(quota) / float64(period), nil
+}
+
 // NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'.
 func (c *Cgroup) NumCPU() (int, error) {
 	path := c.makePath("cpuset")
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index 7313e473f..f37415810 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -32,16 +32,17 @@ import (
 
 // Debug implements subcommands.Command for the "debug" command.
 type Debug struct {
-	pid          int
-	stacks       bool
-	signal       int
-	profileHeap  string
-	profileCPU   string
-	profileDelay int
-	trace        string
-	strace       string
-	logLevel     string
-	logPackets   string
+	pid         int
+	stacks      bool
+	signal      int
+	profileHeap string
+	profileCPU  string
+	trace       string
+	strace      string
+	logLevel    string
+	logPackets  string
+	duration    time.Duration
+	ps          bool
 }
 
 // Name implements subcommands.Command.
@@ -65,12 +66,13 @@ func (d *Debug) SetFlags(f *flag.FlagSet) {
 	f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log")
 	f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.")
 	f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.")
-	f.IntVar(&d.profileDelay, "profile-delay", 5, "amount of time to wait before stoping CPU profile")
+	f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles")
 	f.StringVar(&d.trace, "trace", "", "writes an execution trace to the given file.")
 	f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox")
 	f.StringVar(&d.strace, "strace", "", `A comma separated list of syscalls to trace. "all" enables all traces, "off" disables all`)
 	f.StringVar(&d.logLevel, "log-level", "", "The log level to set: warning (0), info (1), or debug (2).")
 	f.StringVar(&d.logPackets, "log-packets", "", "A boolean value to enable or disable packet logging: true or false.")
+	f.BoolVar(&d.ps, "ps", false, "lists processes")
 }
 
 // Execute implements subcommands.Command.Execute.
@@ -163,7 +165,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		if err := c.Sandbox.StartCPUProfile(f); err != nil {
 			return Errorf(err.Error())
 		}
-		log.Infof("CPU profile started for %d sec, writing to %q", d.profileDelay, d.profileCPU)
+		log.Infof("CPU profile started for %v, writing to %q", d.duration, d.profileCPU)
 	}
 	if d.trace != "" {
 		delay = true
@@ -181,8 +183,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		if err := c.Sandbox.StartTrace(f); err != nil {
 			return Errorf(err.Error())
 		}
-		log.Infof("Tracing started for %d sec, writing to %q", d.profileDelay, d.trace)
-
+		log.Infof("Tracing started for %v, writing to %q", d.duration, d.trace)
 	}
 
 	if d.strace != "" || len(d.logLevel) != 0 || len(d.logPackets) != 0 {
@@ -241,9 +242,20 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		}
 		log.Infof("Logging options changed")
 	}
+	if d.ps {
+		pList, err := c.Processes()
+		if err != nil {
+			Fatalf("getting processes for container: %v", err)
+		}
+		o, err := control.ProcessListToJSON(pList)
+		if err != nil {
+			Fatalf("generating JSON: %v", err)
+		}
+		log.Infof(o)
+	}
 
 	if delay {
-		time.Sleep(time.Duration(d.profileDelay) * time.Second)
+		time.Sleep(d.duration)
 	}
 
 	return subcommands.ExitSuccess
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
index e817eff77..d1e99243b 100644
--- a/runsc/cmd/exec.go
+++ b/runsc/cmd/exec.go
@@ -105,11 +105,11 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) {
 // Execute implements subcommands.Command.Execute. It starts a process in an
 // already created container.
 func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
-	e, id, err := ex.parseArgs(f)
+	conf := args[0].(*boot.Config)
+	e, id, err := ex.parseArgs(f, conf.EnableRaw)
 	if err != nil {
 		Fatalf("parsing process spec: %v", err)
 	}
-	conf := args[0].(*boot.Config)
 	waitStatus := args[1].(*syscall.WaitStatus)
 
 	c, err := container.Load(conf.RootDir, id)
@@ -117,6 +117,9 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		Fatalf("loading sandbox: %v", err)
 	}
 
+	log.Debugf("Exec arguments: %+v", e)
+	log.Debugf("Exec capablities: %+v", e.Capabilities)
+
 	// Replace empty settings with defaults from container.
 	if e.WorkingDirectory == "" {
 		e.WorkingDirectory = c.Spec.Process.Cwd
@@ -127,15 +130,13 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 			Fatalf("getting environment variables: %v", err)
 		}
 	}
+
 	if e.Capabilities == nil {
-		// enableRaw is set to true to prevent the filtering out of
-		// CAP_NET_RAW. This is the opposite of Create() because exec
-		// requires the capability to be set explicitly, while 'docker
-		// run' sets it by default.
-		e.Capabilities, err = specutils.Capabilities(true /* enableRaw */, c.Spec.Process.Capabilities)
+		e.Capabilities, err = specutils.Capabilities(conf.EnableRaw, c.Spec.Process.Capabilities)
 		if err != nil {
 			Fatalf("creating capabilities: %v", err)
 		}
+		log.Infof("Using exec capabilities from container: %+v", e.Capabilities)
 	}
 
 	// containerd expects an actual process to represent the container being
@@ -282,14 +283,14 @@ func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.Exi
 // parseArgs parses exec information from the command line or a JSON file
 // depending on whether the --process flag was used. Returns an ExecArgs and
 // the ID of the container to be used.
-func (ex *Exec) parseArgs(f *flag.FlagSet) (*control.ExecArgs, string, error) {
+func (ex *Exec) parseArgs(f *flag.FlagSet, enableRaw bool) (*control.ExecArgs, string, error) {
 	if ex.processPath == "" {
 		// Requires at least a container ID and command.
 		if f.NArg() < 2 {
 			f.Usage()
 			return nil, "", fmt.Errorf("both a container-id and command are required")
 		}
-		e, err := ex.argsFromCLI(f.Args()[1:])
+		e, err := ex.argsFromCLI(f.Args()[1:], enableRaw)
 		return e, f.Arg(0), err
 	}
 	// Requires only the container ID.
@@ -297,11 +298,11 @@ func (ex *Exec) parseArgs(f *flag.FlagSet) (*control.ExecArgs, string, error) {
 		f.Usage()
 		return nil, "", fmt.Errorf("a container-id is required")
 	}
-	e, err := ex.argsFromProcessFile()
+	e, err := ex.argsFromProcessFile(enableRaw)
 	return e, f.Arg(0), err
 }
 
-func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) {
+func (ex *Exec) argsFromCLI(argv []string, enableRaw bool) (*control.ExecArgs, error) {
 	extraKGIDs := make([]auth.KGID, 0, len(ex.extraKGIDs))
 	for _, s := range ex.extraKGIDs {
 		kgid, err := strconv.Atoi(s)
@@ -314,7 +315,7 @@ func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) {
 	var caps *auth.TaskCapabilities
 	if len(ex.caps) > 0 {
 		var err error
-		caps, err = capabilities(ex.caps)
+		caps, err = capabilities(ex.caps, enableRaw)
 		if err != nil {
 			return nil, fmt.Errorf("capabilities error: %v", err)
 		}
@@ -332,7 +333,7 @@ func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) {
 	}, nil
 }
 
-func (ex *Exec) argsFromProcessFile() (*control.ExecArgs, error) {
+func (ex *Exec) argsFromProcessFile(enableRaw bool) (*control.ExecArgs, error) {
 	f, err := os.Open(ex.processPath)
 	if err != nil {
 		return nil, fmt.Errorf("error opening process file: %s, %v", ex.processPath, err)
@@ -342,21 +343,21 @@ func (ex *Exec) argsFromProcessFile() (*control.ExecArgs, error) {
 	if err := json.NewDecoder(f).Decode(&p); err != nil {
 		return nil, fmt.Errorf("error parsing process file: %s, %v", ex.processPath, err)
 	}
-	return argsFromProcess(&p)
+	return argsFromProcess(&p, enableRaw)
 }
 
 // argsFromProcess performs all the non-IO conversion from the Process struct
 // to ExecArgs.
-func argsFromProcess(p *specs.Process) (*control.ExecArgs, error) {
+func argsFromProcess(p *specs.Process, enableRaw bool) (*control.ExecArgs, error) {
 	// Create capabilities.
 	var caps *auth.TaskCapabilities
 	if p.Capabilities != nil {
 		var err error
-		// enableRaw is set to true to prevent the filtering out of
-		// CAP_NET_RAW. This is the opposite of Create() because exec
-		// requires the capability to be set explicitly, while 'docker
-		// run' sets it by default.
-		caps, err = specutils.Capabilities(true /* enableRaw */, p.Capabilities)
+		// Starting from Docker 19, capabilities are explicitly set for exec (instead
+		// of nil like before). So we can't distinguish 'exec' from
+		// 'exec --privileged', as both specify CAP_NET_RAW. Therefore, filter
+		// CAP_NET_RAW in the same way as container start.
+		caps, err = specutils.Capabilities(enableRaw, p.Capabilities)
 		if err != nil {
 			return nil, fmt.Errorf("error creating capabilities: %v", err)
 		}
@@ -409,7 +410,7 @@ func resolveEnvs(envs ...[]string) ([]string, error) {
 // capabilities takes a list of capabilities as strings and returns an
 // auth.TaskCapabilities struct with those capabilities in every capability set.
 // This mimics runc's behavior.
-func capabilities(cs []string) (*auth.TaskCapabilities, error) {
+func capabilities(cs []string, enableRaw bool) (*auth.TaskCapabilities, error) {
 	var specCaps specs.LinuxCapabilities
 	for _, cap := range cs {
 		specCaps.Ambient = append(specCaps.Ambient, cap)
@@ -418,11 +419,11 @@ func capabilities(cs []string) (*auth.TaskCapabilities, error) {
 		specCaps.Inheritable = append(specCaps.Inheritable, cap)
 		specCaps.Permitted = append(specCaps.Permitted, cap)
 	}
-	// enableRaw is set to true to prevent the filtering out of
-	// CAP_NET_RAW. This is the opposite of Create() because exec requires
-	// the capability to be set explicitly, while 'docker run' sets it by
-	// default.
-	return specutils.Capabilities(true /* enableRaw */, &specCaps)
+	// Starting from Docker 19, capabilities are explicitly set for exec (instead
+	// of nil like before). So we can't distinguish 'exec' from
+	// 'exec --privileged', as both specify CAP_NET_RAW. Therefore, filter
+	// CAP_NET_RAW in the same way as container start.
+	return specutils.Capabilities(enableRaw, &specCaps)
 }
 
 // stringSlice allows a flag to be used multiple times, where each occurrence
diff --git a/runsc/cmd/exec_test.go b/runsc/cmd/exec_test.go
index eb38a431f..a1e980d08 100644
--- a/runsc/cmd/exec_test.go
+++ b/runsc/cmd/exec_test.go
@@ -91,7 +91,7 @@ func TestCLIArgs(t *testing.T) {
 	}
 
 	for _, tc := range testCases {
-		e, err := tc.ex.argsFromCLI(tc.argv)
+		e, err := tc.ex.argsFromCLI(tc.argv, true)
 		if err != nil {
 			t.Errorf("argsFromCLI(%+v): got error: %+v", tc.ex, err)
 		} else if !cmp.Equal(*e, tc.expected, cmpopts.IgnoreUnexported(os.File{})) {
@@ -144,7 +144,7 @@ func TestJSONArgs(t *testing.T) {
 	}
 
 	for _, tc := range testCases {
-		e, err := argsFromProcess(&tc.p)
+		e, err := argsFromProcess(&tc.p, true)
 		if err != nil {
 			t.Errorf("argsFromProcess(%+v): got error: %+v", tc.p, err)
 		} else if !cmp.Equal(*e, tc.expected, cmpopts.IgnoreUnexported(os.File{})) {
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 9faabf494..4831210c0 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -27,6 +27,7 @@ import (
 	"flag"
 	"github.com/google/subcommands"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/unet"
@@ -135,7 +136,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	//
 	// Note that all mount points have been mounted in the proper location in
 	// setupRootFS().
-	cleanMounts, err := resolveMounts(spec.Mounts, root)
+	cleanMounts, err := resolveMounts(conf, spec.Mounts, root)
 	if err != nil {
 		Fatalf("Failure to resolve mounts: %v", err)
 	}
@@ -182,6 +183,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 			cfg := fsgofer.Config{
 				ROMount:      isReadonlyMount(m.Options),
 				PanicOnWrite: g.panicOnWrite,
+				HostUDS:      conf.FSGoferHostUDS,
 			}
 			ap, err := fsgofer.NewAttachPoint(m.Destination, cfg)
 			if err != nil {
@@ -200,6 +202,10 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs))
 	}
 
+	if conf.FSGoferHostUDS {
+		filter.InstallUDSFilters()
+	}
+
 	if err := filter.Install(); err != nil {
 		Fatalf("installing seccomp filters: %v", err)
 	}
@@ -375,7 +381,7 @@ func setupMounts(mounts []specs.Mount, root string) error {
 // Otherwise, it may follow symlinks to locations that would be overwritten
 // with another mount point and return the wrong location. In short, make sure
 // setupMounts() has been called before.
-func resolveMounts(mounts []specs.Mount, root string) ([]specs.Mount, error) {
+func resolveMounts(conf *boot.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
 	cleanMounts := make([]specs.Mount, 0, len(mounts))
 	for _, m := range mounts {
 		if m.Type != "bind" || !specutils.IsSupportedDevMount(m) {
@@ -390,8 +396,15 @@ func resolveMounts(mounts []specs.Mount, root string) ([]specs.Mount, error) {
 		if err != nil {
 			panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, root, err))
 		}
+
+		opts, err := adjustMountOptions(conf, filepath.Join(root, relDst), m.Options)
+		if err != nil {
+			return nil, err
+		}
+
 		cpy := m
 		cpy.Destination = filepath.Join("/", relDst)
+		cpy.Options = opts
 		cleanMounts = append(cleanMounts, cpy)
 	}
 	return cleanMounts, nil
@@ -418,7 +431,7 @@ func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, erro
 		path := filepath.Join(base, name)
 		if !strings.HasPrefix(path, root) {
 			// One cannot '..' their way out of root.
-			path = root
+			base = root
 			continue
 		}
 		fi, err := os.Lstat(path)
@@ -448,3 +461,20 @@ func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, erro
 	}
 	return base, nil
 }
+
+// adjustMountOptions adds 'overlayfs_stale_read' if mounting over overlayfs.
+func adjustMountOptions(conf *boot.Config, path string, opts []string) ([]string, error) {
+	rv := make([]string, len(opts))
+	copy(rv, opts)
+
+	if conf.OverlayfsStaleRead {
+		statfs := syscall.Statfs_t{}
+		if err := syscall.Statfs(path, &statfs); err != nil {
+			return nil, err
+		}
+		if statfs.Type == unix.OVERLAYFS_SUPER_MAGIC {
+			rv = append(rv, "overlayfs_stale_read")
+		}
+	}
+	return rv, nil
+}
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index bc1fa25e3..2bd12120d 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "container.go",
         "hook.go",
+        "state_file.go",
         "status.go",
     ],
     importpath = "gvisor.dev/gvisor/runsc/container",
@@ -47,6 +48,7 @@ go_test(
     ],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/bits",
         "//pkg/log",
         "//pkg/sentry/control",
         "//pkg/sentry/kernel",
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 7d67c3a75..5ed131a7f 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -28,6 +28,7 @@ import (
 	"github.com/kr/pty"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/testutil"
@@ -219,9 +220,9 @@ func TestJobControlSignalExec(t *testing.T) {
 	// Make sure all the processes are running.
 	expectedPL := []*control.Process{
 		// Root container process.
-		{PID: 1, Cmd: "sleep"},
+		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		// Bash from exec process.
-		{PID: 2, Cmd: "bash"},
+		{PID: 2, Cmd: "bash", Threads: []kernel.ThreadID{2}},
 	}
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Error(err)
@@ -231,7 +232,7 @@ func TestJobControlSignalExec(t *testing.T) {
 	ptyMaster.Write([]byte("sleep 100\n"))
 
 	// Wait for it to start. Sleep's PPID is bash's PID.
-	expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep"})
+	expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}})
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Error(err)
 	}
@@ -361,7 +362,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
 
 	// Wait for bash to start.
 	expectedPL := []*control.Process{
-		{PID: 1, Cmd: "bash"},
+		{PID: 1, Cmd: "bash", Threads: []kernel.ThreadID{1}},
 	}
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Fatal(err)
@@ -371,7 +372,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
 	ptyMaster.Write([]byte("sleep 100\n"))
 
 	// Wait for sleep to start.
-	expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep"})
+	expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{2}})
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Fatal(err)
 	}
diff --git a/runsc/container/container.go b/runsc/container/container.go
index bbb364214..68782c4be 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -17,13 +17,11 @@ package container
 
 import (
 	"context"
-	"encoding/json"
 	"fmt"
 	"io/ioutil"
 	"os"
 	"os/exec"
 	"os/signal"
-	"path/filepath"
 	"regexp"
 	"strconv"
 	"strings"
@@ -31,7 +29,6 @@ import (
 	"time"
 
 	"github.com/cenkalti/backoff"
-	"github.com/gofrs/flock"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/control"
@@ -41,17 +38,6 @@ import (
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
-const (
-	// metadataFilename is the name of the metadata file relative to the
-	// container root directory that holds sandbox metadata.
-	metadataFilename = "meta.json"
-
-	// metadataLockFilename is the name of a lock file in the container
-	// root directory that is used to prevent concurrent modifications to
-	// the container state and metadata.
-	metadataLockFilename = "meta.lock"
-)
-
 // validateID validates the container id.
 func validateID(id string) error {
 	// See libcontainer/factory_linux.go.
@@ -99,11 +85,6 @@ type Container struct {
 	// BundleDir is the directory containing the container bundle.
 	BundleDir string `json:"bundleDir"`
 
-	// Root is the directory containing the container metadata file. If this
-	// container is the root container, Root and RootContainerDir will be the
-	// same.
-	Root string `json:"root"`
-
 	// CreatedAt is the time the container was created.
 	CreatedAt time.Time `json:"createdAt"`
 
@@ -121,21 +102,24 @@ type Container struct {
 	// be 0 if the gofer has been killed.
 	GoferPid int `json:"goferPid"`
 
+	// Sandbox is the sandbox this container is running in. It's set when the
+	// container is created and reset when the sandbox is destroyed.
+	Sandbox *sandbox.Sandbox `json:"sandbox"`
+
+	// Saver handles load from/save to the state file safely from multiple
+	// processes.
+	Saver StateFile `json:"saver"`
+
+	//
+	// Fields below this line are not saved in the state file and will not
+	// be preserved across commands.
+	//
+
 	// goferIsChild is set if a gofer process is a child of the current process.
 	//
 	// This field isn't saved to json, because only a creator of a gofer
 	// process will have it as a child process.
 	goferIsChild bool
-
-	// Sandbox is the sandbox this container is running in. It's set when the
-	// container is created and reset when the sandbox is destroyed.
-	Sandbox *sandbox.Sandbox `json:"sandbox"`
-
-	// RootContainerDir is the root directory containing the metadata file of the
-	// sandbox root container. It's used to lock in order to serialize creating
-	// and deleting this Container's metadata directory. If this container is the
-	// root container, this is the same as Root.
-	RootContainerDir string
 }
 
 // loadSandbox loads all containers that belong to the sandbox with the given
@@ -166,43 +150,35 @@ func loadSandbox(rootDir, id string) ([]*Container, error) {
 	return containers, nil
 }
 
-// Load loads a container with the given id from a metadata file. id may be an
-// abbreviation of the full container id, in which case Load loads the
-// container to which id unambiguously refers to.
-// Returns ErrNotExist if container doesn't exist.
-func Load(rootDir, id string) (*Container, error) {
-	log.Debugf("Load container %q %q", rootDir, id)
-	if err := validateID(id); err != nil {
+// Load loads a container with the given id from a metadata file. partialID may
+// be an abbreviation of the full container id, in which case Load loads the
+// container to which partialID unambiguously refers. Returns ErrNotExist if
+// the container doesn't exist.
+func Load(rootDir, partialID string) (*Container, error) {
+	log.Debugf("Load container %q %q", rootDir, partialID)
+	if err := validateID(partialID); err != nil {
 		return nil, fmt.Errorf("validating id: %v", err)
 	}
 
-	cRoot, err := findContainerRoot(rootDir, id)
+	id, err := findContainerID(rootDir, partialID)
 	if err != nil {
 		// Preserve error so that callers can distinguish 'not found' errors.
 		return nil, err
 	}
 
-	// Lock the container metadata to prevent other runsc instances from
-	// writing to it while we are reading it.
-	unlock, err := lockContainerMetadata(cRoot)
-	if err != nil {
-		return nil, err
+	state := StateFile{
+		RootDir: rootDir,
+		ID:      id,
 	}
-	defer unlock()
+	defer state.close()
 
-	// Read the container metadata file and create a new Container from it.
-	metaFile := filepath.Join(cRoot, metadataFilename)
-	metaBytes, err := ioutil.ReadFile(metaFile)
-	if err != nil {
+	c := &Container{}
+	if err := state.load(c); err != nil {
 		if os.IsNotExist(err) {
 			// Preserve error so that callers can distinguish 'not found' errors.
 			return nil, err
 		}
-		return nil, fmt.Errorf("reading container metadata file %q: %v", metaFile, err)
-	}
-	var c Container
-	if err := json.Unmarshal(metaBytes, &c); err != nil {
-		return nil, fmt.Errorf("unmarshaling container metadata from %q: %v", metaFile, err)
+		return nil, fmt.Errorf("reading container metadata file %q: %v", state.statePath(), err)
 	}
 
 	// If the status is "Running" or "Created", check that the sandbox
@@ -223,57 +199,37 @@ func Load(rootDir, id string) (*Container, error) {
 		}
 	}
 
-	return &c, nil
+	return c, nil
 }
 
-func findContainerRoot(rootDir, partialID string) (string, error) {
+func findContainerID(rootDir, partialID string) (string, error) {
 	// Check whether the id fully specifies an existing container.
-	cRoot := filepath.Join(rootDir, partialID)
-	if _, err := os.Stat(cRoot); err == nil {
-		return cRoot, nil
+	stateFile := buildStatePath(rootDir, partialID)
+	if _, err := os.Stat(stateFile); err == nil {
+		return partialID, nil
 	}
 
 	// Now see whether id could be an abbreviation of exactly 1 of the
 	// container ids. If id is ambiguous (it could match more than 1
 	// container), it is an error.
-	cRoot = ""
 	ids, err := List(rootDir)
 	if err != nil {
 		return "", err
 	}
+	rv := ""
 	for _, id := range ids {
 		if strings.HasPrefix(id, partialID) {
-			if cRoot != "" {
-				return "", fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, cRoot, id)
+			if rv != "" {
+				return "", fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, rv, id)
 			}
-			cRoot = id
+			rv = id
 		}
 	}
-	if cRoot == "" {
+	if rv == "" {
 		return "", os.ErrNotExist
 	}
-	log.Debugf("abbreviated id %q resolves to full id %q", partialID, cRoot)
-	return filepath.Join(rootDir, cRoot), nil
-}
-
-// List returns all container ids in the given root directory.
-func List(rootDir string) ([]string, error) {
-	log.Debugf("List containers %q", rootDir)
-	fs, err := ioutil.ReadDir(rootDir)
-	if err != nil {
-		return nil, fmt.Errorf("reading dir %q: %v", rootDir, err)
-	}
-	var out []string
-	for _, f := range fs {
-		// Filter out directories that do no belong to a container.
-		cid := f.Name()
-		if validateID(cid) == nil {
-			if _, err := os.Stat(filepath.Join(rootDir, cid, metadataFilename)); err == nil {
-				out = append(out, f.Name())
-			}
-		}
-	}
-	return out, nil
+	log.Debugf("abbreviated id %q resolves to full id %q", partialID, rv)
+	return rv, nil
 }
 
 // Args is used to configure a new container.
@@ -316,44 +272,34 @@ func New(conf *boot.Config, args Args) (*Container, error) {
 		return nil, err
 	}
 
-	unlockRoot, err := maybeLockRootContainer(args.Spec, conf.RootDir)
-	if err != nil {
-		return nil, err
+	if err := os.MkdirAll(conf.RootDir, 0711); err != nil {
+		return nil, fmt.Errorf("creating container root directory: %v", err)
 	}
-	defer unlockRoot()
+
+	c := &Container{
+		ID:            args.ID,
+		Spec:          args.Spec,
+		ConsoleSocket: args.ConsoleSocket,
+		BundleDir:     args.BundleDir,
+		Status:        Creating,
+		CreatedAt:     time.Now(),
+		Owner:         os.Getenv("USER"),
+		Saver: StateFile{
+			RootDir: conf.RootDir,
+			ID:      args.ID,
+		},
+	}
+	// The Cleanup object cleans up partially created containers when an error
+	// occurs. Any errors occurring during cleanup itself are ignored.
+	cu := specutils.MakeCleanup(func() { _ = c.Destroy() })
+	defer cu.Clean()
 
 	// Lock the container metadata file to prevent concurrent creations of
 	// containers with the same id.
-	containerRoot := filepath.Join(conf.RootDir, args.ID)
-	unlock, err := lockContainerMetadata(containerRoot)
-	if err != nil {
+	if err := c.Saver.lockForNew(); err != nil {
 		return nil, err
 	}
-	defer unlock()
-
-	// Check if the container already exists by looking for the metadata
-	// file.
-	if _, err := os.Stat(filepath.Join(containerRoot, metadataFilename)); err == nil {
-		return nil, fmt.Errorf("container with id %q already exists", args.ID)
-	} else if !os.IsNotExist(err) {
-		return nil, fmt.Errorf("looking for existing container in %q: %v", containerRoot, err)
-	}
-
-	c := &Container{
-		ID:               args.ID,
-		Spec:             args.Spec,
-		ConsoleSocket:    args.ConsoleSocket,
-		BundleDir:        args.BundleDir,
-		Root:             containerRoot,
-		Status:           Creating,
-		CreatedAt:        time.Now(),
-		Owner:            os.Getenv("USER"),
-		RootContainerDir: conf.RootDir,
-	}
-	// The Cleanup object cleans up partially created containers when an error occurs.
-	// Any errors occuring during cleanup itself are ignored.
-	cu := specutils.MakeCleanup(func() { _ = c.Destroy() })
-	defer cu.Clean()
+	defer c.Saver.unlock()
 
 	// If the metadata annotations indicate that this container should be
 	// started in an existing sandbox, we must do so. The metadata will
@@ -431,7 +377,7 @@ func New(conf *boot.Config, args Args) (*Container, error) {
 	c.changeStatus(Created)
 
 	// Save the metadata file.
-	if err := c.save(); err != nil {
+	if err := c.saveLocked(); err != nil {
 		return nil, err
 	}
 
@@ -451,17 +397,12 @@ func New(conf *boot.Config, args Args) (*Container, error) {
 func (c *Container) Start(conf *boot.Config) error {
 	log.Debugf("Start container %q", c.ID)
 
-	unlockRoot, err := maybeLockRootContainer(c.Spec, c.RootContainerDir)
-	if err != nil {
+	if err := c.Saver.lock(); err != nil {
 		return err
 	}
-	defer unlockRoot()
+	unlock := specutils.MakeCleanup(func() { c.Saver.unlock() })
+	defer unlock.Clean()
 
-	unlock, err := c.lock()
-	if err != nil {
-		return err
-	}
-	defer unlock()
 	if err := c.requireStatus("start", Created); err != nil {
 		return err
 	}
@@ -509,24 +450,31 @@ func (c *Container) Start(conf *boot.Config) error {
 	}
 
 	c.changeStatus(Running)
-	if err := c.save(); err != nil {
+	if err := c.saveLocked(); err != nil {
 		return err
 	}
 
-	// Adjust the oom_score_adj for sandbox and gofers. This must be done after
-	// save().
-	return c.adjustOOMScoreAdj(conf)
+	// Release lock before adjusting OOM score because the lock is acquired there.
+	unlock.Clean()
+
+	// Adjust the oom_score_adj for sandbox. This must be done after saveLocked().
+	if err := adjustSandboxOOMScoreAdj(c.Sandbox, c.Saver.RootDir, false); err != nil {
+		return err
+	}
+
+	// Set container's oom_score_adj to the gofer since it is dedicated to
+	// the container, in case the gofer uses up too much memory.
+	return c.adjustGoferOOMScoreAdj()
 }
 
 // Restore takes a container and replaces its kernel and file system
 // to restore a container from its state file.
 func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile string) error {
 	log.Debugf("Restore container %q", c.ID)
-	unlock, err := c.lock()
-	if err != nil {
+	if err := c.Saver.lock(); err != nil {
 		return err
 	}
-	defer unlock()
+	defer c.Saver.unlock()
 
 	if err := c.requireStatus("restore", Created); err != nil {
 		return err
@@ -544,7 +492,7 @@ func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile str
 		return err
 	}
 	c.changeStatus(Running)
-	return c.save()
+	return c.saveLocked()
 }
 
 // Run is a helper that calls Create + Start + Wait.
@@ -704,11 +652,10 @@ func (c *Container) Checkpoint(f *os.File) error {
 // The call only succeeds if the container's status is created or running.
 func (c *Container) Pause() error {
 	log.Debugf("Pausing container %q", c.ID)
-	unlock, err := c.lock()
-	if err != nil {
+	if err := c.Saver.lock(); err != nil {
 		return err
 	}
-	defer unlock()
+	defer c.Saver.unlock()
 
 	if c.Status != Created && c.Status != Running {
 		return fmt.Errorf("cannot pause container %q in state %v", c.ID, c.Status)
@@ -718,18 +665,17 @@ func (c *Container) Pause() error {
 		return fmt.Errorf("pausing container: %v", err)
 	}
 	c.changeStatus(Paused)
-	return c.save()
+	return c.saveLocked()
 }
 
 // Resume unpauses the container and its kernel.
 // The call only succeeds if the container's status is paused.
 func (c *Container) Resume() error {
 	log.Debugf("Resuming container %q", c.ID)
-	unlock, err := c.lock()
-	if err != nil {
+	if err := c.Saver.lock(); err != nil {
 		return err
 	}
-	defer unlock()
+	defer c.Saver.unlock()
 
 	if c.Status != Paused {
 		return fmt.Errorf("cannot resume container %q in state %v", c.ID, c.Status)
@@ -738,7 +684,7 @@ func (c *Container) Resume() error {
 		return fmt.Errorf("resuming container: %v", err)
 	}
 	c.changeStatus(Running)
-	return c.save()
+	return c.saveLocked()
 }
 
 // State returns the metadata of the container.
@@ -766,6 +712,17 @@ func (c *Container) Processes() ([]*control.Process, error) {
 func (c *Container) Destroy() error {
 	log.Debugf("Destroy container %q", c.ID)
 
+	if err := c.Saver.lock(); err != nil {
+		return err
+	}
+	defer func() {
+		c.Saver.unlock()
+		c.Saver.close()
+	}()
+
+	// Stored for later use as stop() sets c.Sandbox to nil.
+	sb := c.Sandbox
+
 	// We must perform the following cleanup steps:
 	// * stop the container and gofer processes,
 	// * remove the container filesystem on the host, and
@@ -775,35 +732,43 @@ func (c *Container) Destroy() error {
 	// do our best to perform all of the cleanups. Hence, we keep a slice
 	// of errors return their concatenation.
 	var errs []string
-
-	unlock, err := maybeLockRootContainer(c.Spec, c.RootContainerDir)
-	if err != nil {
-		return err
-	}
-	defer unlock()
-
 	if err := c.stop(); err != nil {
 		err = fmt.Errorf("stopping container: %v", err)
 		log.Warningf("%v", err)
 		errs = append(errs, err.Error())
 	}
 
-	if err := os.RemoveAll(c.Root); err != nil && !os.IsNotExist(err) {
-		err = fmt.Errorf("deleting container root directory %q: %v", c.Root, err)
+	if err := c.Saver.destroy(); err != nil {
+		err = fmt.Errorf("deleting container state files: %v", err)
 		log.Warningf("%v", err)
 		errs = append(errs, err.Error())
 	}
 
 	c.changeStatus(Stopped)
 
+	// Adjust oom_score_adj for the sandbox. This must be done after the container
+	// is stopped and the directory at c.Root is removed. Adjustment can be
+	// skipped if the root container is exiting, because it brings down the entire
+	// sandbox.
+	//
+	// Use 'sb' to tell whether it has been executed before because Destroy must
+	// be idempotent.
+	if sb != nil && !isRoot(c.Spec) {
+		if err := adjustSandboxOOMScoreAdj(sb, c.Saver.RootDir, true); err != nil {
+			errs = append(errs, err.Error())
+		}
+	}
+
 	// "If any poststop hook fails, the runtime MUST log a warning, but the
-	// remaining hooks and lifecycle continue as if the hook had succeeded" -OCI spec.
-	// Based on the OCI, "The post-stop hooks MUST be called after the container is
-	// deleted but before the delete operation returns"
+	// remaining hooks and lifecycle continue as if the hook had
+	// succeeded" - OCI spec.
+	//
+	// Based on the OCI, "The post-stop hooks MUST be called after the container
+	// is deleted but before the delete operation returns"
 	// Run it here to:
 	// 1) Conform to the OCI.
-	// 2) Make sure it only runs once, because the root has been deleted, the container
-	// can't be loaded again.
+	// 2) Make sure it only runs once, because the root has been deleted, the
+	// container can't be loaded again.
 	if c.Spec.Hooks != nil {
 		executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State())
 	}
@@ -814,18 +779,13 @@ func (c *Container) Destroy() error {
 	return fmt.Errorf(strings.Join(errs, "\n"))
 }
 
-// save saves the container metadata to a file.
+// saveLocked saves the container metadata to a file.
 //
 // Precondition: container must be locked with container.lock().
-func (c *Container) save() error {
+func (c *Container) saveLocked() error {
 	log.Debugf("Save container %q", c.ID)
-	metaFile := filepath.Join(c.Root, metadataFilename)
-	meta, err := json.Marshal(c)
-	if err != nil {
-		return fmt.Errorf("invalid container metadata: %v", err)
-	}
-	if err := ioutil.WriteFile(metaFile, meta, 0640); err != nil {
-		return fmt.Errorf("writing container metadata: %v", err)
+	if err := c.Saver.saveLocked(c); err != nil {
+		return fmt.Errorf("saving container metadata: %v", err)
 	}
 	return nil
 }
@@ -926,7 +886,14 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund
 	}
 
 	if conf.DebugLog != "" {
-		debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "gofer")
+		test := ""
+		if len(conf.TestOnlyTestNameEnv) != 0 {
+			// Fetch test name if one is provided and the test only flag was set.
+			if t, ok := specutils.EnvVar(spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
+				test = t
+			}
+		}
+		debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "gofer", test)
 		if err != nil {
 			return nil, nil, fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err)
 		}
@@ -1079,50 +1046,8 @@ func (c *Container) requireStatus(action string, statuses ...Status) error {
 	return fmt.Errorf("cannot %s container %q in state %s", action, c.ID, c.Status)
 }
 
-// lock takes a file lock on the container metadata lock file.
-func (c *Container) lock() (func() error, error) {
-	return lockContainerMetadata(filepath.Join(c.Root, c.ID))
-}
-
-// lockContainerMetadata takes a file lock on the metadata lock file in the
-// given container root directory.
-func lockContainerMetadata(containerRootDir string) (func() error, error) {
-	if err := os.MkdirAll(containerRootDir, 0711); err != nil {
-		return nil, fmt.Errorf("creating container root directory %q: %v", containerRootDir, err)
-	}
-	f := filepath.Join(containerRootDir, metadataLockFilename)
-	l := flock.NewFlock(f)
-	if err := l.Lock(); err != nil {
-		return nil, fmt.Errorf("acquiring lock on container lock file %q: %v", f, err)
-	}
-	return l.Unlock, nil
-}
-
-// maybeLockRootContainer locks the sandbox root container. It is used to
-// prevent races to create and delete child container sandboxes.
-func maybeLockRootContainer(spec *specs.Spec, rootDir string) (func() error, error) {
-	if isRoot(spec) {
-		return func() error { return nil }, nil
-	}
-
-	sbid, ok := specutils.SandboxID(spec)
-	if !ok {
-		return nil, fmt.Errorf("no sandbox ID found when locking root container")
-	}
-	sb, err := Load(rootDir, sbid)
-	if err != nil {
-		return nil, err
-	}
-
-	unlock, err := sb.lock()
-	if err != nil {
-		return nil, err
-	}
-	return unlock, nil
-}
-
 func isRoot(spec *specs.Spec) bool {
-	return specutils.ShouldCreateSandbox(spec)
+	return specutils.SpecContainerType(spec) != specutils.ContainerTypeContainer
 }
 
 // runInCgroup executes fn inside the specified cgroup. If cg is nil, execute
@@ -1139,33 +1064,85 @@ func runInCgroup(cg *cgroup.Cgroup, fn func() error) error {
 	return fn()
 }
 
-// adjustOOMScoreAdj sets the oom_score_adj for the sandbox and all gofers.
+// adjustGoferOOMScoreAdj sets the oom_score_adj for the container's gofer.
+func (c *Container) adjustGoferOOMScoreAdj() error {
+	if c.GoferPid != 0 && c.Spec.Process.OOMScoreAdj != nil {
+		if err := setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj); err != nil {
+			// Ignore NotExist error because it can be returned when the sandbox
+			// exited while OOM score was being adjusted.
+			if !os.IsNotExist(err) {
+				return fmt.Errorf("setting gofer oom_score_adj for container %q: %v", c.ID, err)
+			}
+			log.Warningf("Gofer process (%d) not found setting oom_score_adj", c.GoferPid)
+		}
+	}
+
+	return nil
+}
+
+// adjustSandboxOOMScoreAdj sets the oom_score_adj for the sandbox.
 // oom_score_adj is set to the lowest oom_score_adj among the containers
 // running in the sandbox.
 //
 // TODO(gvisor.dev/issue/512): This call could race with other containers being
 // created at the same time and end up setting the wrong oom_score_adj to the
 // sandbox.
-func (c *Container) adjustOOMScoreAdj(conf *boot.Config) error {
-	// If this container's OOMScoreAdj is nil then we can exit early as no
-	// change should be made to oom_score_adj for the sandbox.
-	if c.Spec.Process.OOMScoreAdj == nil {
-		return nil
-	}
-
-	containers, err := loadSandbox(conf.RootDir, c.Sandbox.ID)
+func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, rootDir string, destroy bool) error {
+	containers, err := loadSandbox(rootDir, s.ID)
 	if err != nil {
 		return fmt.Errorf("loading sandbox containers: %v", err)
 	}
 
+	// Do nothing if the sandbox has been terminated.
+	if len(containers) == 0 {
+		return nil
+	}
+
 	// Get the lowest score for all containers.
 	var lowScore int
 	scoreFound := false
-	for _, container := range containers {
-		if container.Spec.Process.OOMScoreAdj != nil && (!scoreFound || *container.Spec.Process.OOMScoreAdj < lowScore) {
+	if len(containers) == 1 && specutils.SpecContainerType(containers[0].Spec) == specutils.ContainerTypeUnspecified {
+		// This is a single-container sandbox. Set the oom_score_adj to
+		// the value specified in the OCI bundle.
+		if containers[0].Spec.Process.OOMScoreAdj != nil {
 			scoreFound = true
-			lowScore = *container.Spec.Process.OOMScoreAdj
+			lowScore = *containers[0].Spec.Process.OOMScoreAdj
 		}
+	} else {
+		for _, container := range containers {
+			// Special multi-container support for CRI. Ignore the root
+			// container when calculating oom_score_adj for the sandbox because
+			// it is the infrastructure (pause) container and always has a very
+			// low oom_score_adj.
+			//
+			// We will use OOMScoreAdj in the single-container case where the
+			// containerd container-type annotation is not present.
+			if specutils.SpecContainerType(container.Spec) == specutils.ContainerTypeSandbox {
+				continue
+			}
+
+			if container.Spec.Process.OOMScoreAdj != nil && (!scoreFound || *container.Spec.Process.OOMScoreAdj < lowScore) {
+				scoreFound = true
+				lowScore = *container.Spec.Process.OOMScoreAdj
+			}
+		}
+	}
+
+	// If the container is destroyed and remaining containers have no
+	// oomScoreAdj specified then we must revert to the oom_score_adj of the
+	// parent process.
+	if !scoreFound && destroy {
+		ppid, err := specutils.GetParentPid(s.Pid)
+		if err != nil {
+			return fmt.Errorf("getting parent pid of sandbox pid %d: %v", s.Pid, err)
+		}
+		pScore, err := specutils.GetOOMScoreAdj(ppid)
+		if err != nil {
+			return fmt.Errorf("getting oom_score_adj of parent %d: %v", ppid, err)
+		}
+
+		scoreFound = true
+		lowScore = pScore
 	}
 
 	// Only set oom_score_adj if one of the containers has oom_score_adj set
@@ -1177,15 +1154,15 @@ func (c *Container) adjustOOMScoreAdj(conf *boot.Config) error {
 	}
 
 	// Set the lowest of all containers oom_score_adj to the sandbox.
-	if err := setOOMScoreAdj(c.Sandbox.Pid, lowScore); err != nil {
-		return fmt.Errorf("setting oom_score_adj for sandbox %q: %v", c.Sandbox.ID, err)
+	if err := setOOMScoreAdj(s.Pid, lowScore); err != nil {
+		// Ignore NotExist error because it can be returned when the sandbox
+		// exited while OOM score was being adjusted.
+		if !os.IsNotExist(err) {
+			return fmt.Errorf("setting oom_score_adj for sandbox %q: %v", s.ID, err)
+		}
+		log.Warningf("Sandbox process (%d) not found setting oom_score_adj", s.Pid)
 	}
 
-	// Set container's oom_score_adj to the gofer since it is dedicated to the
-	// container, in case the gofer uses up too much memory.
-	if err := setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj); err != nil {
-		return fmt.Errorf("setting gofer oom_score_adj for container %q: %v", c.ID, err)
-	}
 	return nil
 }
 
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 2ac12e5b6..c10f85992 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -34,8 +34,10 @@ import (
 	"github.com/cenkalti/backoff"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/bits"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/boot/platforms"
@@ -51,8 +53,9 @@ func waitForProcessList(cont *Container, want []*control.Process) error {
 			err = fmt.Errorf("error getting process data from container: %v", err)
 			return &backoff.PermanentError{Err: err}
 		}
-		if !procListsEqual(got, want) {
-			return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want))
+		if r, err := procListsEqual(got, want); !r {
+			return fmt.Errorf("container got process list: %s, want: %s: error: %v",
+				procListToString(got), procListToString(want), err)
 		}
 		return nil
 	}
@@ -90,22 +93,34 @@ func blockUntilWaitable(pid int) error {
 
 // procListsEqual is used to check whether 2 Process lists are equal for all
 // implemented fields.
-func procListsEqual(got, want []*control.Process) bool {
+func procListsEqual(got, want []*control.Process) (bool, error) {
 	if len(got) != len(want) {
-		return false
+		return false, nil
 	}
 	for i := range got {
 		pd1 := got[i]
 		pd2 := want[i]
-		// Zero out unimplemented and timing dependant fields.
+		// Zero out timing dependent fields.
 		pd1.Time = ""
 		pd1.STime = ""
 		pd1.C = 0
-		if *pd1 != *pd2 {
-			return false
+		// Ignore TTY field too, since it's not relevant in the cases
+		// where we use this method. Tests that care about the TTY
+		// field should check for it themselves.
+		pd1.TTY = ""
+		pd1Json, err := control.ProcessListToJSON([]*control.Process{pd1})
+		if err != nil {
+			return false, err
+		}
+		pd2Json, err := control.ProcessListToJSON([]*control.Process{pd2})
+		if err != nil {
+			return false, err
+		}
+		if pd1Json != pd2Json {
+			return false, nil
 		}
 	}
-	return true
+	return true, nil
 }
 
 // getAndCheckProcLists is similar to waitForProcessList, but does not wait and retry the
@@ -115,7 +130,11 @@ func getAndCheckProcLists(cont *Container, want []*control.Process) error {
 	if err != nil {
 		return fmt.Errorf("error getting process data from container: %v", err)
 	}
-	if procListsEqual(got, want) {
+	equal, err := procListsEqual(got, want)
+	if err != nil {
+		return err
+	}
+	if equal {
 		return nil
 	}
 	return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want))
@@ -287,11 +306,12 @@ func TestLifecycle(t *testing.T) {
 		// expectedPL lists the expected process state of the container.
 		expectedPL := []*control.Process{
 			{
-				UID:  0,
-				PID:  1,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     0,
+				PID:     1,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{1},
 			},
 		}
 		// Create the container.
@@ -589,18 +609,20 @@ func TestExec(t *testing.T) {
 		// expectedPL lists the expected process state of the container.
 		expectedPL := []*control.Process{
 			{
-				UID:  0,
-				PID:  1,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     0,
+				PID:     1,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{1},
 			},
 			{
-				UID:  uid,
-				PID:  2,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     uid,
+				PID:     2,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{2},
 			},
 		}
 
@@ -1061,18 +1083,20 @@ func TestPauseResume(t *testing.T) {
 		// expectedPL lists the expected process state of the container.
 		expectedPL := []*control.Process{
 			{
-				UID:  0,
-				PID:  1,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     0,
+				PID:     1,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{1},
 			},
 			{
-				UID:  uid,
-				PID:  2,
-				PPID: 0,
-				C:    0,
-				Cmd:  "bash",
+				UID:     uid,
+				PID:     2,
+				PPID:    0,
+				C:       0,
+				Cmd:     "bash",
+				Threads: []kernel.ThreadID{2},
 			},
 		}
 
@@ -1125,11 +1149,12 @@ func TestPauseResume(t *testing.T) {
 
 		expectedPL2 := []*control.Process{
 			{
-				UID:  0,
-				PID:  1,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     0,
+				PID:     1,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{1},
 			},
 		}
 
@@ -1240,18 +1265,20 @@ func TestCapabilities(t *testing.T) {
 		// expectedPL lists the expected process state of the container.
 		expectedPL := []*control.Process{
 			{
-				UID:  0,
-				PID:  1,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     0,
+				PID:     1,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{1},
 			},
 			{
-				UID:  uid,
-				PID:  2,
-				PPID: 0,
-				C:    0,
-				Cmd:  "exe",
+				UID:     uid,
+				PID:     2,
+				PPID:    0,
+				C:       0,
+				Cmd:     "exe",
+				Threads: []kernel.ThreadID{2},
 			},
 		}
 		if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
@@ -1547,7 +1574,8 @@ func TestAbbreviatedIDs(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfigWithRoot(rootDir)
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
 
 	cids := []string{
 		"foo-" + testutil.UniqueContainerID(),
@@ -2049,6 +2077,156 @@ func TestMountSymlink(t *testing.T) {
 	}
 }
 
+// TestNetRaw checks that --net-raw enables, and its absence disables, the CAP_NET_RAW capability.
+func TestNetRaw(t *testing.T) {
+	capNetRaw := strconv.FormatUint(bits.MaskOf64(int(linux.CAP_NET_RAW)), 10)
+	app, err := testutil.FindFile("runsc/container/test_app/test_app")
+	if err != nil {
+		t.Fatal("error finding test_app:", err)
+	}
+
+	for _, enableRaw := range []bool{true, false} {
+		conf := testutil.TestConfig()
+		conf.EnableRaw = enableRaw
+
+		test := "--enabled"
+		if !enableRaw {
+			test = "--disabled"
+		}
+
+		spec := testutil.NewSpecWithArgs(app, "capability", test, capNetRaw)
+		if err := run(spec, conf); err != nil {
+			t.Fatalf("Error running container: %v", err)
+		}
+	}
+}
+
+// TestOverlayfsStaleRead is the most basic test that '--overlayfs-stale-read' works.
+func TestOverlayfsStaleRead(t *testing.T) {
+	conf := testutil.TestConfig()
+	conf.OverlayfsStaleRead = true
+
+	in, err := ioutil.TempFile(testutil.TmpDir(), "stale-read.in")
+	if err != nil {
+		t.Fatalf("ioutil.TempFile() failed: %v", err)
+	}
+	defer in.Close()
+	if _, err := in.WriteString("stale data"); err != nil {
+		t.Fatalf("in.Write() failed: %v", err)
+	}
+
+	out, err := ioutil.TempFile(testutil.TmpDir(), "stale-read.out")
+	if err != nil {
+		t.Fatalf("ioutil.TempFile() failed: %v", err)
+	}
+	defer out.Close()
+
+	const want = "foobar"
+	cmd := fmt.Sprintf("cat %q && echo %q> %q && cp %q %q", in.Name(), want, in.Name(), in.Name(), out.Name())
+	spec := testutil.NewSpecWithArgs("/bin/bash", "-c", cmd)
+	if err := run(spec, conf); err != nil {
+		t.Fatalf("Error running container: %v", err)
+	}
+
+	gotBytes, err := ioutil.ReadAll(out)
+	if err != nil {
+		t.Fatalf("out.Read() failed: %v", err)
+	}
+	got := strings.TrimSpace(string(gotBytes))
+	if want != got {
+		t.Errorf("Wrong content in out file, got: %q. want: %q", got, want)
+	}
+}
+
+// TestTTYField checks TTY field returned by container.Processes().
+func TestTTYField(t *testing.T) {
+	stop := testutil.StartReaper()
+	defer stop()
+
+	testApp, err := testutil.FindFile("runsc/container/test_app/test_app")
+	if err != nil {
+		t.Fatal("error finding test_app:", err)
+	}
+
+	testCases := []struct {
+		name         string
+		useTTY       bool
+		wantTTYField string
+	}{
+		{
+			name:         "no tty",
+			useTTY:       false,
+			wantTTYField: "?",
+		},
+		{
+			name:         "tty used",
+			useTTY:       true,
+			wantTTYField: "pts/0",
+		},
+	}
+
+	for _, test := range testCases {
+		t.Run(test.name, func(t *testing.T) {
+			conf := testutil.TestConfig()
+
+			// We will run /bin/sleep, possibly with an open TTY.
+			cmd := []string{"/bin/sleep", "10000"}
+			if test.useTTY {
+				// Run inside the "pty-runner".
+				cmd = append([]string{testApp, "pty-runner"}, cmd...)
+			}
+
+			spec := testutil.NewSpecWithArgs(cmd...)
+			rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer os.RemoveAll(rootDir)
+			defer os.RemoveAll(bundleDir)
+
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.UniqueContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			// Wait for sleep to be running, and check the TTY
+			// field.
+			var gotTTYField string
+			cb := func() error {
+				ps, err := c.Processes()
+				if err != nil {
+					err = fmt.Errorf("error getting process data from container: %v", err)
+					return &backoff.PermanentError{Err: err}
+				}
+				for _, p := range ps {
+					if strings.Contains(p.Cmd, "sleep") {
+						gotTTYField = p.TTY
+						return nil
+					}
+				}
+				return fmt.Errorf("sleep not running")
+			}
+			if err := testutil.Poll(cb, 30*time.Second); err != nil {
+				t.Fatalf("error waiting for sleep process: %v", err)
+			}
+
+			if gotTTYField != test.wantTTYField {
+				t.Errorf("tty field got %q, want %q", gotTTYField, test.wantTTYField)
+			}
+		})
+	}
+}
+
 // executeSync synchronously executes a new process.
 func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) {
 	pid, err := cont.Execute(args)
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index bd45a5118..4ad09ceab 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -60,13 +60,8 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
 }
 
 func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) {
-	// Setup root dir if one hasn't been provided.
 	if len(conf.RootDir) == 0 {
-		rootDir, err := testutil.SetupRootDir()
-		if err != nil {
-			return nil, nil, fmt.Errorf("error creating root dir: %v", err)
-		}
-		conf.RootDir = rootDir
+		panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.")
 	}
 
 	var containers []*Container
@@ -78,7 +73,6 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
 		for _, b := range bundles {
 			os.RemoveAll(b)
 		}
-		os.RemoveAll(conf.RootDir)
 	}
 	for i, spec := range specs {
 		bundleDir, err := testutil.SetupBundleDir(spec)
@@ -129,11 +123,11 @@ func execMany(execs []execDesc) error {
 
 func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
 	for _, spec := range pod {
-		spec.Annotations[path.Join(boot.MountPrefix, name, "source")] = mount.Source
-		spec.Annotations[path.Join(boot.MountPrefix, name, "type")] = mount.Type
-		spec.Annotations[path.Join(boot.MountPrefix, name, "share")] = "pod"
+		spec.Annotations[boot.MountPrefix+name+".source"] = mount.Source
+		spec.Annotations[boot.MountPrefix+name+".type"] = mount.Type
+		spec.Annotations[boot.MountPrefix+name+".share"] = "pod"
 		if len(mount.Options) > 0 {
-			spec.Annotations[path.Join(boot.MountPrefix, name, "options")] = strings.Join(mount.Options, ",")
+			spec.Annotations[boot.MountPrefix+name+".options"] = strings.Join(mount.Options, ",")
 		}
 	}
 }
@@ -144,6 +138,13 @@ func TestMultiContainerSanity(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		specs, ids := createSpecs(sleep, sleep)
@@ -155,13 +156,13 @@ func TestMultiContainerSanity(t *testing.T) {
 
 		// Check via ps that multiple processes are running.
 		expectedPL := []*control.Process{
-			{PID: 1, Cmd: "sleep"},
+			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		}
 		if err := waitForProcessList(containers[0], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
 		}
 		expectedPL = []*control.Process{
-			{PID: 2, Cmd: "sleep"},
+			{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
 		}
 		if err := waitForProcessList(containers[1], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
@@ -175,6 +176,13 @@ func TestMultiPIDNS(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		testSpecs, ids := createSpecs(sleep, sleep)
@@ -194,13 +202,13 @@ func TestMultiPIDNS(t *testing.T) {
 
 		// Check via ps that multiple processes are running.
 		expectedPL := []*control.Process{
-			{PID: 1, Cmd: "sleep"},
+			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		}
 		if err := waitForProcessList(containers[0], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
 		}
 		expectedPL = []*control.Process{
-			{PID: 1, Cmd: "sleep"},
+			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		}
 		if err := waitForProcessList(containers[1], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
@@ -213,6 +221,13 @@ func TestMultiPIDNSPath(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		testSpecs, ids := createSpecs(sleep, sleep, sleep)
@@ -249,7 +264,7 @@ func TestMultiPIDNSPath(t *testing.T) {
 
 		// Check via ps that multiple processes are running.
 		expectedPL := []*control.Process{
-			{PID: 1, Cmd: "sleep"},
+			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		}
 		if err := waitForProcessList(containers[0], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
@@ -259,7 +274,7 @@ func TestMultiPIDNSPath(t *testing.T) {
 		}
 
 		expectedPL = []*control.Process{
-			{PID: 2, Cmd: "sleep"},
+			{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
 		}
 		if err := waitForProcessList(containers[1], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
@@ -268,13 +283,21 @@ func TestMultiPIDNSPath(t *testing.T) {
 }
 
 func TestMultiContainerWait(t *testing.T) {
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	// The first container should run the entire duration of the test.
 	cmd1 := []string{"sleep", "100"}
 	// We'll wait on the second container, which is much shorter lived.
 	cmd2 := []string{"sleep", "1"}
 	specs, ids := createSpecs(cmd1, cmd2)
 
-	conf := testutil.TestConfig()
 	containers, cleanup, err := startContainers(conf, specs, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
@@ -283,7 +306,7 @@ func TestMultiContainerWait(t *testing.T) {
 
 	// Check via ps that multiple processes are running.
 	expectedPL := []*control.Process{
-		{PID: 2, Cmd: "sleep"},
+		{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
 	}
 	if err := waitForProcessList(containers[1], expectedPL); err != nil {
 		t.Errorf("failed to wait for sleep to start: %v", err)
@@ -328,7 +351,7 @@ func TestMultiContainerWait(t *testing.T) {
 	// After Wait returns, ensure that the root container is running and
 	// the child has finished.
 	expectedPL = []*control.Process{
-		{PID: 1, Cmd: "sleep"},
+		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 	}
 	if err := waitForProcessList(containers[0], expectedPL); err != nil {
 		t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err)
@@ -344,12 +367,14 @@ func TestExecWait(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	// The first container should run the entire duration of the test.
 	cmd1 := []string{"sleep", "100"}
 	// We'll wait on the second container, which is much shorter lived.
 	cmd2 := []string{"sleep", "1"}
 	specs, ids := createSpecs(cmd1, cmd2)
-	conf := testutil.TestConfig()
 	containers, cleanup, err := startContainers(conf, specs, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
@@ -358,7 +383,7 @@ func TestExecWait(t *testing.T) {
 
 	// Check via ps that process is running.
 	expectedPL := []*control.Process{
-		{PID: 2, Cmd: "sleep"},
+		{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
 	}
 	if err := waitForProcessList(containers[1], expectedPL); err != nil {
 		t.Fatalf("failed to wait for sleep to start: %v", err)
@@ -393,7 +418,7 @@ func TestExecWait(t *testing.T) {
 
 	// Wait for the exec'd process to exit.
 	expectedPL = []*control.Process{
-		{PID: 1, Cmd: "sleep"},
+		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 	}
 	if err := waitForProcessList(containers[0], expectedPL); err != nil {
 		t.Fatalf("failed to wait for second container to stop: %v", err)
@@ -432,7 +457,15 @@ func TestMultiContainerMount(t *testing.T) {
 	})
 
 	// Setup the containers.
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
 	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	containers, cleanup, err := startContainers(conf, sps, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
@@ -454,6 +487,13 @@ func TestMultiContainerSignal(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		specs, ids := createSpecs(sleep, sleep)
@@ -465,7 +505,7 @@ func TestMultiContainerSignal(t *testing.T) {
 
 		// Check via ps that container 1 process is running.
 		expectedPL := []*control.Process{
-			{PID: 2, Cmd: "sleep"},
+			{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
 		}
 
 		if err := waitForProcessList(containers[1], expectedPL); err != nil {
@@ -479,7 +519,7 @@ func TestMultiContainerSignal(t *testing.T) {
 
 		// Make sure process 1 is still running.
 		expectedPL = []*control.Process{
-			{PID: 1, Cmd: "sleep"},
+			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		}
 		if err := waitForProcessList(containers[0], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
@@ -548,6 +588,13 @@ func TestMultiContainerDestroy(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// First container will remain intact while the second container is killed.
 		podSpecs, ids := createSpecs(
 			[]string{"sleep", "100"},
@@ -586,9 +633,10 @@ func TestMultiContainerDestroy(t *testing.T) {
 		if err != nil {
 			t.Fatalf("error getting process data from sandbox: %v", err)
 		}
-		expectedPL := []*control.Process{{PID: 1, Cmd: "sleep"}}
-		if !procListsEqual(pss, expectedPL) {
-			t.Errorf("container got process list: %s, want: %s", procListToString(pss), procListToString(expectedPL))
+		expectedPL := []*control.Process{{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}}
+		if r, err := procListsEqual(pss, expectedPL); !r {
+			t.Errorf("container got process list: %s, want: %s: error: %v",
+				procListToString(pss), procListToString(expectedPL), err)
 		}
 
 		// Check that cont.Destroy is safe to call multiple times.
@@ -599,13 +647,21 @@ func TestMultiContainerDestroy(t *testing.T) {
 }
 
 func TestMultiContainerProcesses(t *testing.T) {
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	// Note: use curly braces to keep 'sh' process around. Otherwise, shell
 	// will just execve into 'sleep' and both containers will look the
 	// same.
 	specs, ids := createSpecs(
 		[]string{"sleep", "100"},
 		[]string{"sh", "-c", "{ sleep 100; }"})
-	conf := testutil.TestConfig()
 	containers, cleanup, err := startContainers(conf, specs, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
@@ -614,7 +670,7 @@ func TestMultiContainerProcesses(t *testing.T) {
 
 	// Check root's container process list doesn't include other containers.
 	expectedPL0 := []*control.Process{
-		{PID: 1, Cmd: "sleep"},
+		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 	}
 	if err := waitForProcessList(containers[0], expectedPL0); err != nil {
 		t.Errorf("failed to wait for process to start: %v", err)
@@ -622,8 +678,8 @@ func TestMultiContainerProcesses(t *testing.T) {
 
 	// Same for the other container.
 	expectedPL1 := []*control.Process{
-		{PID: 2, Cmd: "sh"},
-		{PID: 3, PPID: 2, Cmd: "sleep"},
+		{PID: 2, Cmd: "sh", Threads: []kernel.ThreadID{2}},
+		{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}},
 	}
 	if err := waitForProcessList(containers[1], expectedPL1); err != nil {
 		t.Errorf("failed to wait for process to start: %v", err)
@@ -637,7 +693,7 @@ func TestMultiContainerProcesses(t *testing.T) {
 	if _, err := containers[1].Execute(args); err != nil {
 		t.Fatalf("error exec'ing: %v", err)
 	}
-	expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep"})
+	expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep", Threads: []kernel.ThreadID{4}})
 	if err := waitForProcessList(containers[1], expectedPL1); err != nil {
 		t.Errorf("failed to wait for process to start: %v", err)
 	}
@@ -650,6 +706,15 @@ func TestMultiContainerProcesses(t *testing.T) {
 // TestMultiContainerKillAll checks that all process that belong to a container
 // are killed when SIGKILL is sent to *all* processes in that container.
 func TestMultiContainerKillAll(t *testing.T) {
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	for _, tc := range []struct {
 		killContainer bool
 	}{
@@ -665,7 +730,6 @@ func TestMultiContainerKillAll(t *testing.T) {
 		specs, ids := createSpecs(
 			[]string{app, "task-tree", "--depth=2", "--width=2"},
 			[]string{app, "task-tree", "--depth=4", "--width=2"})
-		conf := testutil.TestConfig()
 		containers, cleanup, err := startContainers(conf, specs, ids)
 		if err != nil {
 			t.Fatalf("error starting containers: %v", err)
@@ -739,19 +803,13 @@ func TestMultiContainerDestroyNotStarted(t *testing.T) {
 	specs, ids := createSpecs(
 		[]string{"/bin/sleep", "100"},
 		[]string{"/bin/sleep", "100"})
-	rootDir, err := testutil.SetupRootDir()
-	if err != nil {
-		t.Fatalf("error creating root dir: %v", err)
-	}
-	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfigWithRoot(rootDir)
-
-	// Create and start root container.
-	rootBundleDir, err := testutil.SetupBundleDir(specs[0])
+	conf := testutil.TestConfig()
+	rootDir, rootBundleDir, err := testutil.SetupContainer(specs[0], conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
+	defer os.RemoveAll(rootDir)
 	defer os.RemoveAll(rootBundleDir)
 
 	rootArgs := Args{
@@ -800,19 +858,12 @@ func TestMultiContainerDestroyStarting(t *testing.T) {
 	}
 	specs, ids := createSpecs(cmds...)
 
-	rootDir, err := testutil.SetupRootDir()
-	if err != nil {
-		t.Fatalf("error creating root dir: %v", err)
-	}
-	defer os.RemoveAll(rootDir)
-
-	conf := testutil.TestConfigWithRoot(rootDir)
-
-	// Create and start root container.
-	rootBundleDir, err := testutil.SetupBundleDir(specs[0])
+	conf := testutil.TestConfig()
+	rootDir, rootBundleDir, err := testutil.SetupContainer(specs[0], conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
+	defer os.RemoveAll(rootDir)
 	defer os.RemoveAll(rootBundleDir)
 
 	rootArgs := Args{
@@ -886,9 +937,17 @@ func TestMultiContainerDifferentFilesystems(t *testing.T) {
 	script := fmt.Sprintf("if [ -f %q ]; then exit 1; else touch %q; fi", filename, filename)
 	cmd := []string{"sh", "-c", script}
 
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	// Make sure overlay is enabled, and none of the root filesystems are
 	// read-only, otherwise we won't be able to create the file.
-	conf := testutil.TestConfig()
 	conf.Overlay = true
 	specs, ids := createSpecs(cmdRoot, cmd, cmd)
 	for _, s := range specs {
@@ -941,26 +1000,21 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) {
 	}
 	allSpecs, allIDs := createSpecs(cmds...)
 
-	rootDir, err := testutil.SetupRootDir()
-	if err != nil {
-		t.Fatalf("error creating root dir: %v", err)
-	}
-	defer os.RemoveAll(rootDir)
-
 	// Split up the specs and IDs.
 	rootSpec := allSpecs[0]
 	rootID := allIDs[0]
 	childrenSpecs := allSpecs[1:]
 	childrenIDs := allIDs[1:]
 
-	bundleDir, err := testutil.SetupBundleDir(rootSpec)
+	conf := testutil.TestConfig()
+	rootDir, bundleDir, err := testutil.SetupContainer(rootSpec, conf)
 	if err != nil {
-		t.Fatalf("error setting up bundle dir: %v", err)
+		t.Fatalf("error setting up container: %v", err)
 	}
+	defer os.RemoveAll(rootDir)
 	defer os.RemoveAll(bundleDir)
 
 	// Start root container.
-	conf := testutil.TestConfigWithRoot(rootDir)
 	rootArgs := Args{
 		ID:        rootID,
 		Spec:      rootSpec,
@@ -1029,6 +1083,13 @@ func TestMultiContainerSharedMount(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		podSpec, ids := createSpecs(sleep, sleep)
@@ -1137,6 +1198,13 @@ func TestMultiContainerSharedMountReadonly(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		podSpec, ids := createSpecs(sleep, sleep)
@@ -1197,6 +1265,13 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		podSpec, ids := createSpecs(sleep, sleep)
@@ -1297,6 +1372,59 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
 	}
 }
 
+// Test that unsupported pod mount options are ignored when matching master and
+// slave mounts.
+func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
+	// Setup the containers.
+	sleep := []string{"/bin/sleep", "100"}
+	podSpec, ids := createSpecs(sleep, sleep)
+	mnt0 := specs.Mount{
+		Destination: "/mydir/test",
+		Source:      "/some/dir",
+		Type:        "tmpfs",
+		Options:     []string{"rw", "rbind", "relatime"},
+	}
+	podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+
+	mnt1 := mnt0
+	mnt1.Destination = "/mydir2/test2"
+	mnt1.Options = []string{"rw", "nosuid"}
+	podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+
+	createSharedMount(mnt0, "test-mount", podSpec...)
+
+	containers, cleanup, err := startContainers(conf, podSpec, ids)
+	if err != nil {
+		t.Fatalf("error starting containers: %v", err)
+	}
+	defer cleanup()
+
+	execs := []execDesc{
+		{
+			c:    containers[0],
+			cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
+			desc: "directory is mounted in container0",
+		},
+		{
+			c:    containers[1],
+			cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
+			desc: "directory is mounted in container1",
+		},
+	}
+	if err := execMany(execs); err != nil {
+		t.Fatal(err.Error())
+	}
+}
+
 // Test that one container can send an FD to another container, even though
 // they have distinct MountNamespaces.
 func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
@@ -1329,6 +1457,15 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
 		Type:        "tmpfs",
 	}
 
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	// Create the specs.
 	specs, ids := createSpecs(
 		[]string{"sleep", "1000"},
@@ -1339,7 +1476,6 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
 	specs[1].Mounts = append(specs[2].Mounts, sharedMnt, writeableMnt)
 	specs[2].Mounts = append(specs[1].Mounts, sharedMnt)
 
-	conf := testutil.TestConfig()
 	containers, cleanup, err := startContainers(conf, specs, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
@@ -1358,9 +1494,17 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
 
 // Test that container is destroyed when Gofer is killed.
 func TestMultiContainerGoferKilled(t *testing.T) {
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	sleep := []string{"sleep", "100"}
 	specs, ids := createSpecs(sleep, sleep, sleep)
-	conf := testutil.TestConfig()
 	containers, cleanup, err := startContainers(conf, specs, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
@@ -1370,7 +1514,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 	// Ensure container is running
 	c := containers[2]
 	expectedPL := []*control.Process{
-		{PID: 3, Cmd: "sleep"},
+		{PID: 3, Cmd: "sleep", Threads: []kernel.ThreadID{3}},
 	}
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Errorf("failed to wait for sleep to start: %v", err)
@@ -1398,7 +1542,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 			continue // container[2] has been killed.
 		}
 		pl := []*control.Process{
-			{PID: kernel.ThreadID(i + 1), Cmd: "sleep"},
+			{PID: kernel.ThreadID(i + 1), Cmd: "sleep", Threads: []kernel.ThreadID{kernel.ThreadID(i + 1)}},
 		}
 		if err := waitForProcessList(c, pl); err != nil {
 			t.Errorf("Container %q was affected by another container: %v", c.ID, err)
@@ -1418,7 +1562,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 	// Wait until sandbox stops. waitForProcessList will loop until sandbox exits
 	// and RPC errors out.
 	impossiblePL := []*control.Process{
-		{PID: 100, Cmd: "non-existent-process"},
+		{PID: 100, Cmd: "non-existent-process", Threads: []kernel.ThreadID{100}},
 	}
 	if err := waitForProcessList(c, impossiblePL); err == nil {
 		t.Fatalf("Sandbox was not killed after gofer death")
@@ -1436,7 +1580,15 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 func TestMultiContainerLoadSandbox(t *testing.T) {
 	sleep := []string{"sleep", "100"}
 	specs, ids := createSpecs(sleep, sleep, sleep)
+
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
 	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
 
 	// Create containers for the sandbox.
 	wants, cleanup, err := startContainers(conf, specs, ids)
@@ -1529,7 +1681,15 @@ func TestMultiContainerRunNonRoot(t *testing.T) {
 		Type:        "bind",
 	})
 
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
 	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	pod, cleanup, err := startContainers(conf, podSpecs, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
diff --git a/runsc/container/state_file.go b/runsc/container/state_file.go
new file mode 100644
index 000000000..d95151ea5
--- /dev/null
+++ b/runsc/container/state_file.go
@@ -0,0 +1,185 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package container
+
+import (
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"sync"
+
+	"github.com/gofrs/flock"
+	"gvisor.dev/gvisor/pkg/log"
+)
+
+const stateFileExtension = ".state"
+
+// StateFile handles load from/save to container state safely from multiple
+// processes. It uses a lock file to provide synchronization between operations.
+//
+// The lock file is located at: "${s.RootDir}/${s.ID}.lock".
+// The state file is located at: "${s.RootDir}/${s.ID}.state".
+type StateFile struct {
+	// RootDir is the directory containing the container metadata file.
+	RootDir string `json:"rootDir"`
+
+	// ID is the container ID.
+	ID string `json:"id"`
+
+	//
+	// Fields below this line are not saved in the state file and will not
+	// be preserved across commands.
+	//
+
+	once  sync.Once
+	flock *flock.Flock
+}
+
+// List returns all container ids in the given root directory.
+func List(rootDir string) ([]string, error) {
+	log.Debugf("List containers %q", rootDir)
+	list, err := filepath.Glob(filepath.Join(rootDir, "*"+stateFileExtension))
+	if err != nil {
+		return nil, err
+	}
+	var out []string
+	for _, path := range list {
+		// Filter out files that do not belong to a container.
+		fileName := filepath.Base(path)
+		if len(fileName) < len(stateFileExtension) {
+			panic(fmt.Sprintf("invalid file match %q", path))
+		}
+		// Remove the extension.
+		cid := fileName[:len(fileName)-len(stateFileExtension)]
+		if validateID(cid) == nil {
+			out = append(out, cid)
+		}
+	}
+	return out, nil
+}
+
+// lock globally locks all locking operations for the container.
+func (s *StateFile) lock() error {
+	s.once.Do(func() {
+		s.flock = flock.NewFlock(s.lockPath())
+	})
+
+	if err := s.flock.Lock(); err != nil {
+		return fmt.Errorf("acquiring lock on %q: %v", s.flock, err)
+	}
+	return nil
+}
+
+// lockForNew acquires the lock and checks if the state file doesn't exist. This
+// is done to ensure that more than one creation didn't race to create
+// containers with the same ID.
+func (s *StateFile) lockForNew() error {
+	if err := s.lock(); err != nil {
+		return err
+	}
+
+	// Checks if the container already exists by looking for the metadata file.
+	if _, err := os.Stat(s.statePath()); err == nil {
+		s.unlock()
+		return fmt.Errorf("container already exists")
+	} else if !os.IsNotExist(err) {
+		s.unlock()
+		return fmt.Errorf("looking for existing container: %v", err)
+	}
+	return nil
+}
+
+// unlock globally unlocks all locking operations for the container.
+func (s *StateFile) unlock() error {
+	if !s.flock.Locked() {
+		panic("unlock called without lock held")
+	}
+
+	if err := s.flock.Unlock(); err != nil {
+		log.Warningf("Error to release lock on %q: %v", s.flock, err)
+		return fmt.Errorf("releasing lock on %q: %v", s.flock, err)
+	}
+	return nil
+}
+
+// saveLocked saves 'v' to the state file.
+//
+// Preconditions: lock() must have been called before.
+func (s *StateFile) saveLocked(v interface{}) error {
+	if !s.flock.Locked() {
+		panic("saveLocked called without lock held")
+	}
+
+	meta, err := json.Marshal(v)
+	if err != nil {
+		return err
+	}
+	if err := ioutil.WriteFile(s.statePath(), meta, 0640); err != nil {
+		return fmt.Errorf("writing json file: %v", err)
+	}
+	return nil
+}
+
+func (s *StateFile) load(v interface{}) error {
+	if err := s.lock(); err != nil {
+		return err
+	}
+	defer s.unlock()
+
+	metaBytes, err := ioutil.ReadFile(s.statePath())
+	if err != nil {
+		return err
+	}
+	return json.Unmarshal(metaBytes, &v)
+}
+
+func (s *StateFile) close() error {
+	if s.flock == nil {
+		return nil
+	}
+	if s.flock.Locked() {
+		panic("Closing locked file")
+	}
+	return s.flock.Close()
+}
+
+func buildStatePath(rootDir, id string) string {
+	return filepath.Join(rootDir, id+stateFileExtension)
+}
+
+// statePath is the full path to the state file.
+func (s *StateFile) statePath() string {
+	return buildStatePath(s.RootDir, s.ID)
+}
+
+// lockPath is the full path to the lock file.
+func (s *StateFile) lockPath() string {
+	return filepath.Join(s.RootDir, s.ID+".lock")
+}
+
+// destroy deletes all state created by the StateFile. It may be called with the
+// lock file held. In that case, the lock file must still be unlocked and
+// properly closed after destroy returns.
+func (s *StateFile) destroy() error {
+	if err := os.Remove(s.statePath()); err != nil && !os.IsNotExist(err) {
+		return err
+	}
+	if err := os.Remove(s.lockPath()); err != nil && !os.IsNotExist(err) {
+		return err
+	}
+	return nil
+}
diff --git a/runsc/container/test_app/BUILD b/runsc/container/test_app/BUILD
index 9bf9e6e9d..bfd338bb6 100644
--- a/runsc/container/test_app/BUILD
+++ b/runsc/container/test_app/BUILD
@@ -15,5 +15,6 @@ go_binary(
         "//pkg/unet",
         "//runsc/testutil",
         "@com_github_google_subcommands//:go_default_library",
+        "@com_github_kr_pty//:go_default_library",
     ],
 )
diff --git a/runsc/container/test_app/test_app.go b/runsc/container/test_app/test_app.go
index 7f735c254..a1c8a741a 100644
--- a/runsc/container/test_app/test_app.go
+++ b/runsc/container/test_app/test_app.go
@@ -19,25 +19,31 @@ package main
 import (
 	"context"
 	"fmt"
+	"io"
+	"io/ioutil"
 	"log"
 	"net"
 	"os"
 	"os/exec"
+	"regexp"
 	"strconv"
 	sys "syscall"
 	"time"
 
 	"flag"
 	"github.com/google/subcommands"
+	"github.com/kr/pty"
 	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 func main() {
 	subcommands.Register(subcommands.HelpCommand(), "")
 	subcommands.Register(subcommands.FlagsCommand(), "")
+	subcommands.Register(new(capability), "")
 	subcommands.Register(new(fdReceiver), "")
 	subcommands.Register(new(fdSender), "")
 	subcommands.Register(new(forkBomb), "")
+	subcommands.Register(new(ptyRunner), "")
 	subcommands.Register(new(reaper), "")
 	subcommands.Register(new(syscall), "")
 	subcommands.Register(new(taskTree), "")
@@ -287,3 +293,102 @@ func (s *syscall) Execute(ctx context.Context, f *flag.FlagSet, args ...interfac
 	}
 	return subcommands.ExitSuccess
 }
+
+type capability struct {
+	enabled  uint64
+	disabled uint64
+}
+
+// Name implements subcommands.Command.
+func (*capability) Name() string {
+	return "capability"
+}
+
+// Synopsis implements subcommands.Command.
+func (*capability) Synopsis() string {
+	return "checks if effective capabilities are set/unset"
+}
+
+// Usage implements subcommands.Command.
+func (*capability) Usage() string {
+	return "capability [--enabled=number] [--disabled=number]"
+}
+
+// SetFlags implements subcommands.Command.
+func (c *capability) SetFlags(f *flag.FlagSet) {
+	f.Uint64Var(&c.enabled, "enabled", 0, "")
+	f.Uint64Var(&c.disabled, "disabled", 0, "")
+}
+
+// Execute implements subcommands.Command. It fails unless CapEff in /proc/self/status matches the flags.
+func (c *capability) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if c.enabled == 0 && c.disabled == 0 {
+		fmt.Println("One of the flags must be set")
+		return subcommands.ExitUsageError
+	}
+
+	status, err := ioutil.ReadFile("/proc/self/status")
+	if err != nil {
+		fmt.Printf("Error reading %q: %v\n", "/proc/self/status", err)
+		return subcommands.ExitFailure
+	}
+	re := regexp.MustCompile("CapEff:\t([0-9a-f]+)\n")
+	matches := re.FindStringSubmatch(string(status))
+	if len(matches) != 2 { // Also covers the nil (no match) case.
+		fmt.Printf("Effective capabilities not found in\n%s\n", status)
+		return subcommands.ExitFailure
+	}
+	caps, err := strconv.ParseUint(matches[1], 16, 64)
+	if err != nil {
+		fmt.Printf("failed to convert capabilities %q: %v\n", matches[1], err)
+		return subcommands.ExitFailure
+	}
+
+	if c.enabled != 0 && (caps&c.enabled) != c.enabled {
+		fmt.Printf("Missing capabilities, want: %#x: got: %#x\n", c.enabled, caps)
+		return subcommands.ExitFailure
+	}
+	if c.disabled != 0 && (caps&c.disabled) != 0 {
+		fmt.Printf("Extra capabilities found, dont_want: %#x: got: %#x\n", c.disabled, caps)
+		return subcommands.ExitFailure
+	}
+
+	return subcommands.ExitSuccess
+}
+
+type ptyRunner struct{}
+
+// Name implements subcommands.Command.
+func (*ptyRunner) Name() string {
+	return "pty-runner"
+}
+
+// Synopsis implements subcommands.Command.
+func (*ptyRunner) Synopsis() string {
+	return "runs the given command with an open pty terminal"
+}
+
+// Usage implements subcommands.Command.
+func (*ptyRunner) Usage() string {
+	return "pty-runner [command]"
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (*ptyRunner) SetFlags(f *flag.FlagSet) {}
+
+// Execute implements subcommands.Command.
+func (*ptyRunner) Execute(_ context.Context, fs *flag.FlagSet, _ ...interface{}) subcommands.ExitStatus {
+	c := exec.Command(fs.Args()[0], fs.Args()[1:]...)
+	f, err := pty.Start(c)
+	if err != nil {
+		fmt.Printf("pty.Start failed: %v", err)
+		return subcommands.ExitFailure
+	}
+	defer f.Close()
+
+	// Copy stdout from the command to keep this process alive until the
+	// subprocess exits.
+	io.Copy(os.Stdout, f)
+
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/criutil/criutil.go b/runsc/criutil/criutil.go
index c8ddf5a9a..773f5a1c4 100644
--- a/runsc/criutil/criutil.go
+++ b/runsc/criutil/criutil.go
@@ -157,13 +157,55 @@ func (cc *Crictl) RmPod(podID string) error {
 	return err
 }
 
-// StartPodAndContainer pulls an image, then starts a sandbox and container in
-// that sandbox. It returns the pod ID and container ID.
-func (cc *Crictl) StartPodAndContainer(image, sbSpec, contSpec string) (string, string, error) {
+// StartContainer pulls the given image and starts the container in the
+// sandbox with the given podID.
+func (cc *Crictl) StartContainer(podID, image, sbSpec, contSpec string) (string, error) {
+	// Write the specs to files that can be read by crictl.
+	sbSpecFile, err := testutil.WriteTmpFile("sbSpec", sbSpec)
+	if err != nil {
+		return "", fmt.Errorf("failed to write sandbox spec: %v", err)
+	}
+	contSpecFile, err := testutil.WriteTmpFile("contSpec", contSpec)
+	if err != nil {
+		return "", fmt.Errorf("failed to write container spec: %v", err)
+	}
+
+	return cc.startContainer(podID, image, sbSpecFile, contSpecFile)
+}
+
+func (cc *Crictl) startContainer(podID, image, sbSpecFile, contSpecFile string) (string, error) {
 	if err := cc.Pull(image); err != nil {
-		return "", "", fmt.Errorf("failed to pull %s: %v", image, err)
+		return "", fmt.Errorf("failed to pull %s: %v", image, err)
+	}
+
+	contID, err := cc.Create(podID, contSpecFile, sbSpecFile)
+	if err != nil {
+		return "", fmt.Errorf("failed to create container in pod %q: %v", podID, err)
+	}
+
+	if _, err := cc.Start(contID); err != nil {
+		return "", fmt.Errorf("failed to start container %q in pod %q: %v", contID, podID, err)
+	}
+
+	return contID, nil
+}
+
+// StopContainer stops and deletes the container with the given container ID.
+func (cc *Crictl) StopContainer(contID string) error {
+	if err := cc.Stop(contID); err != nil {
+		return fmt.Errorf("failed to stop container %q: %v", contID, err)
+	}
+
+	if err := cc.Rm(contID); err != nil {
+		return fmt.Errorf("failed to remove container %q: %v", contID, err)
 	}
 
+	return nil
+}
+
+// StartPodAndContainer pulls an image, then starts a sandbox and container in
+// that sandbox. It returns the pod ID and container ID.
+func (cc *Crictl) StartPodAndContainer(image, sbSpec, contSpec string) (string, string, error) {
 	// Write the specs to files that can be read by crictl.
 	sbSpecFile, err := testutil.WriteTmpFile("sbSpec", sbSpec)
 	if err != nil {
@@ -179,28 +221,17 @@ func (cc *Crictl) StartPodAndContainer(image, sbSpec, contSpec string) (string,
 		return "", "", err
 	}
 
-	contID, err := cc.Create(podID, contSpecFile, sbSpecFile)
-	if err != nil {
-		return "", "", fmt.Errorf("failed to create container in pod %q: %v", podID, err)
-	}
+	contID, err := cc.startContainer(podID, image, sbSpecFile, contSpecFile)
 
-	if _, err := cc.Start(contID); err != nil {
-		return "", "", fmt.Errorf("failed to start container %q in pod %q: %v", contID, podID, err)
-	}
-
-	return podID, contID, nil
+	return podID, contID, err
 }
 
 // StopPodAndContainer stops a container and pod.
 func (cc *Crictl) StopPodAndContainer(podID, contID string) error {
-	if err := cc.Stop(contID); err != nil {
+	if err := cc.StopContainer(contID); err != nil {
 		return fmt.Errorf("failed to stop container %q in pod %q: %v", contID, podID, err)
 	}
 
-	if err := cc.Rm(contID); err != nil {
-		return fmt.Errorf("failed to remove container %q in pod %q: %v", contID, podID, err)
-	}
-
 	if err := cc.StopPod(podID); err != nil {
 		return fmt.Errorf("failed to stop pod %q: %v", podID, err)
 	}
diff --git a/runsc/debian/description b/runsc/debian/description
index 6e3b1b2c0..9e8e08805 100644
--- a/runsc/debian/description
+++ b/runsc/debian/description
@@ -1,5 +1 @@
-gVisor is a user-space kernel, written in Go, that implements a substantial
-portion of the Linux system surface. It includes an Open Container Initiative
-(OCI) runtime called runsc that provides an isolation boundary between the
-application and the host kernel. The runsc runtime integrates with Docker and
-Kubernetes, making it simple to run sandboxed containers.
+gVisor container sandbox runtime
diff --git a/runsc/dockerutil/dockerutil.go b/runsc/dockerutil/dockerutil.go
index 41f5fe1e8..9b6346ca2 100644
--- a/runsc/dockerutil/dockerutil.go
+++ b/runsc/dockerutil/dockerutil.go
@@ -240,7 +240,7 @@ func (d *Docker) Stop() error {
 // Run calls 'docker run' with the arguments provided. The container starts
 // running in the background and the call returns immediately.
 func (d *Docker) Run(args ...string) error {
-	a := []string{"run", "--runtime", d.Runtime, "--name", d.Name, "-d"}
+	a := d.runArgs("-d")
 	a = append(a, args...)
 	_, err := do(a...)
 	if err == nil {
@@ -251,7 +251,7 @@ func (d *Docker) Run(args ...string) error {
 
 // RunWithPty is like Run but with an attached pty.
 func (d *Docker) RunWithPty(args ...string) (*exec.Cmd, *os.File, error) {
-	a := []string{"run", "--runtime", d.Runtime, "--name", d.Name, "-it"}
+	a := d.runArgs("-it")
 	a = append(a, args...)
 	return doWithPty(a...)
 }
@@ -259,8 +259,7 @@ func (d *Docker) RunWithPty(args ...string) (*exec.Cmd, *os.File, error) {
 // RunFg calls 'docker run' with the arguments provided in the foreground. It
 // blocks until the container exits and returns the output.
 func (d *Docker) RunFg(args ...string) (string, error) {
-	a := []string{"run", "--runtime", d.Runtime, "--name", d.Name}
-	a = append(a, args...)
+	a := d.runArgs(args...)
 	out, err := do(a...)
 	if err == nil {
 		d.logDockerID()
@@ -268,6 +267,14 @@ func (d *Docker) RunFg(args ...string) (string, error) {
 	return string(out), err
 }
 
+func (d *Docker) runArgs(args ...string) []string {
+	// Environment variable RUNSC_TEST_NAME is picked up by the runtime and added
+	// to the log name, so one can easily identify the corresponding logs for
+	// this test.
+	rv := []string{"run", "--runtime", d.Runtime, "--name", d.Name, "-e", "RUNSC_TEST_NAME=" + d.Name}
+	return append(rv, args...)
+}
+
 // Logs calls 'docker logs'.
 func (d *Docker) Logs() (string, error) {
 	return do("logs", d.Name)
@@ -275,7 +282,22 @@ func (d *Docker) Logs() (string, error) {
 
 // Exec calls 'docker exec' with the arguments provided.
 func (d *Docker) Exec(args ...string) (string, error) {
-	a := []string{"exec", d.Name}
+	return d.ExecWithFlags(nil, args...)
+}
+
+// ExecWithFlags calls 'docker exec <flags> name <args>'.
+func (d *Docker) ExecWithFlags(flags []string, args ...string) (string, error) {
+	a := []string{"exec"}
+	a = append(a, flags...)
+	a = append(a, d.Name)
+	a = append(a, args...)
+	return do(a...)
+}
+
+// ExecAsUser calls 'docker exec' as the given user with the arguments
+// provided.
+func (d *Docker) ExecAsUser(user string, args ...string) (string, error) {
+	a := []string{"exec", "--user", user, d.Name}
 	a = append(a, args...)
 	return do(a...)
 }
@@ -358,6 +380,16 @@ func (d *Docker) FindPort(sandboxPort int) (int, error) {
 	return port, nil
 }
 
+// FindIP returns the IP address of the container as a string.
+func (d *Docker) FindIP() (string, error) {
+	const format = `{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}`
+	out, err := do("inspect", "-f", format, d.Name)
+	if err != nil {
+		return "", fmt.Errorf("error retrieving IP: %v", err)
+	}
+	return strings.TrimSpace(out), nil
+}
+
 // SandboxPid returns the PID to the sandbox process.
 func (d *Docker) SandboxPid() (int, error) {
 	out, err := do("inspect", "-f={{.State.Pid}}", d.Name)
diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD
index 80a4aa2fe..afcb41801 100644
--- a/runsc/fsgofer/BUILD
+++ b/runsc/fsgofer/BUILD
@@ -6,6 +6,8 @@ go_library(
     name = "fsgofer",
     srcs = [
         "fsgofer.go",
+        "fsgofer_amd64_unsafe.go",
+        "fsgofer_arm64_unsafe.go",
         "fsgofer_unsafe.go",
     ],
     importpath = "gvisor.dev/gvisor/runsc/fsgofer",
diff --git a/runsc/fsgofer/filter/BUILD b/runsc/fsgofer/filter/BUILD
index e2318a978..bac73f89d 100644
--- a/runsc/fsgofer/filter/BUILD
+++ b/runsc/fsgofer/filter/BUILD
@@ -6,6 +6,8 @@ go_library(
     name = "filter",
     srcs = [
         "config.go",
+        "config_amd64.go",
+        "config_arm64.go",
         "extra_filters.go",
         "extra_filters_msan.go",
         "extra_filters_race.go",
@@ -17,6 +19,7 @@ go_library(
     ],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/flipcall",
         "//pkg/log",
         "//pkg/seccomp",
         "@org_golang_x_sys//unix:go_default_library",
diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go
index 8ddfa77d6..a1792330f 100644
--- a/runsc/fsgofer/filter/config.go
+++ b/runsc/fsgofer/filter/config.go
@@ -25,11 +25,7 @@ import (
 
 // allowedSyscalls is the set of syscalls executed by the gofer.
 var allowedSyscalls = seccomp.SyscallRules{
-	syscall.SYS_ACCEPT: {},
-	syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
-		{seccomp.AllowValue(linux.ARCH_GET_FS)},
-		{seccomp.AllowValue(linux.ARCH_SET_FS)},
-	},
+	syscall.SYS_ACCEPT:        {},
 	syscall.SYS_CLOCK_GETTIME: {},
 	syscall.SYS_CLONE: []seccomp.Rule{
 		{
@@ -83,6 +79,11 @@ var allowedSyscalls = seccomp.SyscallRules{
 			seccomp.AllowAny{},
 			seccomp.AllowValue(syscall.F_GETFD),
 		},
+		// Used by flipcall.PacketWindowAllocator.Init().
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(unix.F_ADD_SEALS),
+		},
 	},
 	syscall.SYS_FSTAT:     {},
 	syscall.SYS_FSTATFS:   {},
@@ -103,6 +104,19 @@ var allowedSyscalls = seccomp.SyscallRules{
 			seccomp.AllowAny{},
 			seccomp.AllowValue(0),
 		},
+		// Non-private futex used for flipcall.
+		seccomp.Rule{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(linux.FUTEX_WAIT),
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+		},
+		seccomp.Rule{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(linux.FUTEX_WAKE),
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+		},
 	},
 	syscall.SYS_GETDENTS64:   {},
 	syscall.SYS_GETPID:       {},
@@ -112,6 +126,7 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_LINKAT:       {},
 	syscall.SYS_LSEEK:        {},
 	syscall.SYS_MADVISE:      {},
+	unix.SYS_MEMFD_CREATE:    {}, // Used by flipcall.PacketWindowAllocator.Init().
 	syscall.SYS_MKDIRAT:      {},
 	syscall.SYS_MMAP: []seccomp.Rule{
 		{
@@ -136,7 +151,6 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_MPROTECT:   {},
 	syscall.SYS_MUNMAP:     {},
 	syscall.SYS_NANOSLEEP:  {},
-	syscall.SYS_NEWFSTATAT: {},
 	syscall.SYS_OPENAT:     {},
 	syscall.SYS_PPOLL:      {},
 	syscall.SYS_PREAD64:    {},
@@ -158,8 +172,16 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_RENAMEAT:        {},
 	syscall.SYS_RESTART_SYSCALL: {},
 	syscall.SYS_RT_SIGPROCMASK:  {},
+	syscall.SYS_RT_SIGRETURN:    {},
 	syscall.SYS_SCHED_YIELD:     {},
 	syscall.SYS_SENDMSG: []seccomp.Rule{
+		// Used by fdchannel.Endpoint.SendFD().
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(0),
+		},
+		// Used by unet.SocketWriter.WriteVec().
 		{
 			seccomp.AllowAny{},
 			seccomp.AllowAny{},
@@ -170,7 +192,15 @@ var allowedSyscalls = seccomp.SyscallRules{
 		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
 	},
 	syscall.SYS_SIGALTSTACK: {},
-	syscall.SYS_SYMLINKAT:   {},
+	// Used by fdchannel.NewConnectedSockets().
+	syscall.SYS_SOCKETPAIR: {
+		{
+			seccomp.AllowValue(syscall.AF_UNIX),
+			seccomp.AllowValue(syscall.SOCK_SEQPACKET | syscall.SOCK_CLOEXEC),
+			seccomp.AllowValue(0),
+		},
+	},
+	syscall.SYS_SYMLINKAT: {},
 	syscall.SYS_TGKILL: []seccomp.Rule{
 		{
 			seccomp.AllowValue(uint64(os.Getpid())),
@@ -180,3 +210,28 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_UTIMENSAT: {},
 	syscall.SYS_WRITE:     {},
 }
+
+var udsSyscalls = seccomp.SyscallRules{
+	syscall.SYS_SOCKET: []seccomp.Rule{
+		{
+			seccomp.AllowValue(syscall.AF_UNIX),
+			seccomp.AllowValue(syscall.SOCK_STREAM),
+			seccomp.AllowValue(0),
+		},
+		{
+			seccomp.AllowValue(syscall.AF_UNIX),
+			seccomp.AllowValue(syscall.SOCK_DGRAM),
+			seccomp.AllowValue(0),
+		},
+		{
+			seccomp.AllowValue(syscall.AF_UNIX),
+			seccomp.AllowValue(syscall.SOCK_SEQPACKET),
+			seccomp.AllowValue(0),
+		},
+	},
+	syscall.SYS_CONNECT: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+		},
+	},
+}
diff --git a/runsc/fsgofer/filter/config_amd64.go b/runsc/fsgofer/filter/config_amd64.go
new file mode 100644
index 000000000..a4b28cb8b
--- /dev/null
+++ b/runsc/fsgofer/filter/config_amd64.go
@@ -0,0 +1,33 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+func init() {
+	allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{
+		{seccomp.AllowValue(linux.ARCH_GET_FS)},
+		{seccomp.AllowValue(linux.ARCH_SET_FS)},
+	}
+
+	allowedSyscalls[syscall.SYS_NEWFSTATAT] = []seccomp.Rule{}
+}
diff --git a/runsc/fsgofer/filter/config_arm64.go b/runsc/fsgofer/filter/config_arm64.go
new file mode 100644
index 000000000..d2697deb7
--- /dev/null
+++ b/runsc/fsgofer/filter/config_arm64.go
@@ -0,0 +1,27 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package filter
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/seccomp"
+)
+
+func init() {
+	allowedSyscalls[syscall.SYS_FSTATAT] = []seccomp.Rule{}
+}
diff --git a/runsc/fsgofer/filter/filter.go b/runsc/fsgofer/filter/filter.go
index 65053415f..289886720 100644
--- a/runsc/fsgofer/filter/filter.go
+++ b/runsc/fsgofer/filter/filter.go
@@ -23,11 +23,16 @@ import (
 
 // Install installs seccomp filters.
 func Install() error {
-	s := allowedSyscalls
-
 	// Set of additional filters used by -race and -msan. Returns empty
 	// when not enabled.
-	s.Merge(instrumentationFilters())
+	allowedSyscalls.Merge(instrumentationFilters())
+
+	return seccomp.Install(allowedSyscalls)
+}
 
-	return seccomp.Install(s)
+// InstallUDSFilters extends the allowed syscalls to include those necessary for
+// connecting to a host UDS.
+func InstallUDSFilters() {
+	// Add additional filters required for connecting to the host's sockets.
+	allowedSyscalls.Merge(udsSyscalls)
 }
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 7c4d2b94e..b59e1a70e 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -54,6 +54,7 @@ const (
 	regular fileType = iota
 	directory
 	symlink
+	socket
 	unknown
 )
 
@@ -66,6 +67,8 @@ func (f fileType) String() string {
 		return "directory"
 	case symlink:
 		return "symlink"
+	case socket:
+		return "socket"
 	}
 	return "unknown"
 }
@@ -82,6 +85,9 @@ type Config struct {
 
 	// PanicOnWrite panics on attempts to write to RO mounts.
 	PanicOnWrite bool
+
+	// HostUDS signals whether the gofer can mount a host's UDS.
+	HostUDS bool
 }
 
 type attachPoint struct {
@@ -119,35 +125,35 @@ func NewAttachPoint(prefix string, c Config) (p9.Attacher, error) {
 
 // Attach implements p9.Attacher.
 func (a *attachPoint) Attach() (p9.File, error) {
-	// dirFD (1st argument) is ignored because 'prefix' is always absolute.
-	stat, err := statAt(-1, a.prefix)
-	if err != nil {
-		return nil, fmt.Errorf("stat file %q, err: %v", a.prefix, err)
-	}
-	mode := syscall.O_RDWR
-	if a.conf.ROMount || (stat.Mode&syscall.S_IFMT) == syscall.S_IFDIR {
-		mode = syscall.O_RDONLY
+	a.attachedMu.Lock()
+	defer a.attachedMu.Unlock()
+
+	if a.attached {
+		return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix)
 	}
 
-	// Open the root directory.
-	f, err := fd.Open(a.prefix, openFlags|mode, 0)
+	f, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) {
+		return fd.Open(a.prefix, openFlags|mode, 0)
+	})
 	if err != nil {
-		return nil, fmt.Errorf("unable to open file %q, err: %v", a.prefix, err)
+		return nil, fmt.Errorf("unable to open %q: %v", a.prefix, err)
 	}
 
-	a.attachedMu.Lock()
-	defer a.attachedMu.Unlock()
-	if a.attached {
-		f.Close()
-		return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix)
+	stat, err := stat(f.FD())
+	if err != nil {
+		// Don't leak the freshly opened host FD when the attach fails.
+		f.Close()
+		return nil, fmt.Errorf("unable to stat %q: %v", a.prefix, err)
 	}
 
-	rv, err := newLocalFile(a, f, a.prefix, stat)
+	lf, err := newLocalFile(a, f, a.prefix, stat)
 	if err != nil {
-		return nil, err
+		// On failure newLocalFile returns without closing f, so close it here.
+		f.Close()
+		return nil, fmt.Errorf("unable to create localFile %q: %v", a.prefix, err)
 	}
 	a.attached = true
-	return rv, nil
+	return lf, nil
 }
 
 // makeQID returns a unique QID for the given stat buffer.
@@ -197,6 +199,7 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
 // The reason that the file is not opened initially as read-write is for better
 // performance with 'overlay2' storage driver. overlay2 eagerly copies the
 // entire file up when it's opened in write mode, and would perform badly when
+// multiple files are only being opened for read (esp. startup).
 type localFile struct {
 	p9.DefaultWalkGetAttr
 
@@ -263,10 +266,10 @@ func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, erro
 // actual file open and is customizable by the caller.
 func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error) {
 	// Attempt to open file in the following mode in order:
-	//   1. RDONLY | NONBLOCK: for all files, works for directories and ro mounts too.
-	//      Use non-blocking to prevent getting stuck inside open(2) for FIFOs. This option
-	//      has no effect on regular files.
-	//   2. PATH: for symlinks
+	//   1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs.
+	//      Use non-blocking to prevent getting stuck inside open(2) for
+	//      FIFOs. This option has no effect on regular files.
+	//   2. PATH: for symlinks, sockets.
 	modes := []int{syscall.O_RDONLY | syscall.O_NONBLOCK, unix.O_PATH}
 
 	var err error
@@ -295,7 +298,7 @@ func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error)
 	return file, nil
 }
 
-func getSupportedFileType(stat syscall.Stat_t) (fileType, error) {
+func getSupportedFileType(stat syscall.Stat_t, permitSocket bool) (fileType, error) {
 	var ft fileType
 	switch stat.Mode & syscall.S_IFMT {
 	case syscall.S_IFREG:
@@ -304,6 +307,11 @@ func getSupportedFileType(stat syscall.Stat_t) (fileType, error) {
 		ft = directory
 	case syscall.S_IFLNK:
 		ft = symlink
+	case syscall.S_IFSOCK:
+		if !permitSocket {
+			return unknown, syscall.EPERM
+		}
+		ft = socket
 	default:
 		return unknown, syscall.EPERM
 	}
@@ -311,7 +319,7 @@ func getSupportedFileType(stat syscall.Stat_t) (fileType, error) {
 }
 
 func newLocalFile(a *attachPoint, file *fd.FD, path string, stat syscall.Stat_t) (*localFile, error) {
-	ft, err := getSupportedFileType(stat)
+	ft, err := getSupportedFileType(stat, a.conf.HostUDS)
 	if err != nil {
 		return nil, err
 	}
@@ -359,23 +367,24 @@ func fchown(fd int, uid p9.UID, gid p9.GID) error {
 }
 
 // Open implements p9.File.
-func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
+func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
 	if l.isOpen() {
 		panic(fmt.Sprintf("attempting to open already opened file: %q", l.hostPath))
 	}
 
 	// Check if control file can be used or if a new open must be created.
 	var newFile *fd.FD
-	if mode == p9.ReadOnly {
-		log.Debugf("Open reusing control file, mode: %v, %q", mode, l.hostPath)
+	if flags == p9.ReadOnly {
+		log.Debugf("Open reusing control file, flags: %v, %q", flags, l.hostPath)
 		newFile = l.file
 	} else {
 		// Ideally reopen would call name_to_handle_at (with empty name) and
 		// open_by_handle_at to reopen the file without using 'hostPath'. However,
 		// name_to_handle_at and open_by_handle_at aren't supported by overlay2.
-		log.Debugf("Open reopening file, mode: %v, %q", mode, l.hostPath)
+		log.Debugf("Open reopening file, flags: %v, %q", flags, l.hostPath)
 		var err error
-		newFile, err = reopenProcFd(l.file, openFlags|mode.OSFlags())
+		// Constrain open flags to the open mode and O_TRUNC.
+		newFile, err = reopenProcFd(l.file, openFlags|(flags.OSFlags()&(syscall.O_ACCMODE|syscall.O_TRUNC)))
 		if err != nil {
 			return nil, p9.QID{}, 0, extractErrno(err)
 		}
@@ -402,7 +411,7 @@ func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
 		}
 		l.file = newFile
 	}
-	l.mode = mode
+	l.mode = flags & p9.OpenFlagsModeMask
 	return fd, l.attachPoint.makeQID(stat), 0, nil
 }
 
@@ -594,7 +603,7 @@ func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error)
 		Mode:             p9.FileMode(stat.Mode),
 		UID:              p9.UID(stat.Uid),
 		GID:              p9.GID(stat.Gid),
-		NLink:            stat.Nlink,
+		NLink:            uint64(stat.Nlink),
 		RDev:             stat.Rdev,
 		Size:             uint64(stat.Size),
 		BlockSize:        uint64(stat.Blksize),
@@ -948,14 +957,14 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) {
 }
 
 func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64) ([]p9.Dirent, error) {
+	var dirents []p9.Dirent
+
 	// Limit 'count' to cap the slice size that is returned.
 	const maxCount = 100000
 	if count > maxCount {
 		count = maxCount
 	}
 
-	dirents := make([]p9.Dirent, 0, count)
-
 	// Pre-allocate buffers that will be reused to get partial results.
 	direntsBuf := make([]byte, 8192)
 	names := make([]string, 0, 100)
@@ -1025,8 +1034,48 @@ func (l *localFile) Flush() error {
 }
 
 // Connect implements p9.File.
-func (l *localFile) Connect(p9.ConnectFlags) (*fd.FD, error) {
-	return nil, syscall.ECONNREFUSED
+func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) {
+	if !l.attachPoint.conf.HostUDS {
+		return nil, syscall.ECONNREFUSED
+	}
+
+	// TODO(gvisor.dev/issue/1003): Due to different app vs replacement
+	// mappings, the app path may have fit in the sockaddr, but we can't
+	// fit l.hostPath in our sockaddr. We'd need to redirect through a shorter
+	// path in order to actually connect to this socket.
+	if len(l.hostPath) > linux.UnixPathMax {
+		return nil, syscall.ECONNREFUSED
+	}
+
+	var stype int
+	switch flags {
+	case p9.StreamSocket:
+		stype = syscall.SOCK_STREAM
+	case p9.DgramSocket:
+		stype = syscall.SOCK_DGRAM
+	case p9.SeqpacketSocket:
+		stype = syscall.SOCK_SEQPACKET
+	default:
+		return nil, syscall.ENXIO
+	}
+
+	f, err := syscall.Socket(syscall.AF_UNIX, stype, 0)
+	if err != nil {
+		return nil, err
+	}
+
+	if err := syscall.SetNonblock(f, true); err != nil {
+		syscall.Close(f)
+		return nil, err
+	}
+
+	sa := syscall.SockaddrUnix{Name: l.hostPath}
+	if err := syscall.Connect(f, &sa); err != nil {
+		syscall.Close(f)
+		return nil, err
+	}
+
+	return fd.New(f), nil
 }
 
 // Close implements p9.File.
diff --git a/runsc/fsgofer/fsgofer_amd64_unsafe.go b/runsc/fsgofer/fsgofer_amd64_unsafe.go
new file mode 100644
index 000000000..5d4aab597
--- /dev/null
+++ b/runsc/fsgofer/fsgofer_amd64_unsafe.go
@@ -0,0 +1,49 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package fsgofer
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/syserr"
+)
+
+func statAt(dirFd int, name string) (syscall.Stat_t, error) {
+	nameBytes, err := syscall.BytePtrFromString(name)
+	if err != nil {
+		return syscall.Stat_t{}, err
+	}
+	namePtr := unsafe.Pointer(nameBytes)
+
+	var stat syscall.Stat_t
+	statPtr := unsafe.Pointer(&stat)
+	// Raw stat-at syscall on 'name' relative to dirFd; the result lands in 'stat'.
+	if _, _, errno := syscall.Syscall6(
+		syscall.SYS_NEWFSTATAT,
+		uintptr(dirFd),
+		uintptr(namePtr),
+		uintptr(statPtr),
+		linux.AT_SYMLINK_NOFOLLOW,
+		0,
+		0); errno != 0 {
+		// Convert the host errno through syserr before returning it.
+		return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
+	}
+	return stat, nil
+}
diff --git a/runsc/fsgofer/fsgofer_arm64_unsafe.go b/runsc/fsgofer/fsgofer_arm64_unsafe.go
new file mode 100644
index 000000000..8041fd352
--- /dev/null
+++ b/runsc/fsgofer/fsgofer_arm64_unsafe.go
@@ -0,0 +1,49 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package fsgofer
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/syserr"
+)
+
+func statAt(dirFd int, name string) (syscall.Stat_t, error) {
+	nameBytes, err := syscall.BytePtrFromString(name)
+	if err != nil {
+		return syscall.Stat_t{}, err
+	}
+	namePtr := unsafe.Pointer(nameBytes)
+
+	var stat syscall.Stat_t
+	statPtr := unsafe.Pointer(&stat)
+	// Raw stat-at syscall on 'name' relative to dirFd; the result lands in 'stat'.
+	if _, _, errno := syscall.Syscall6(
+		syscall.SYS_FSTATAT,
+		uintptr(dirFd),
+		uintptr(namePtr),
+		uintptr(statPtr),
+		linux.AT_SYMLINK_NOFOLLOW,
+		0,
+		0); errno != 0 {
+		// Convert the host errno through syserr before returning it.
+		return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
+	}
+	return stat, nil
+}
diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go
index cbbe71019..05af7e397 100644
--- a/runsc/fsgofer/fsgofer_test.go
+++ b/runsc/fsgofer/fsgofer_test.go
@@ -665,7 +665,7 @@ func TestAttachInvalidType(t *testing.T) {
 			}
 			f, err := a.Attach()
 			if f != nil || err == nil {
-				t.Fatalf("Attach should have failed, got (%v, nil)", f)
+				t.Fatalf("Attach should have failed, got (%v, %v)", f, err)
 			}
 		})
 	}
diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go
index ff2556aee..542b54365 100644
--- a/runsc/fsgofer/fsgofer_unsafe.go
+++ b/runsc/fsgofer/fsgofer_unsafe.go
@@ -18,34 +18,9 @@ import (
 	"syscall"
 	"unsafe"
 
-	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/syserr"
 )
 
-func statAt(dirFd int, name string) (syscall.Stat_t, error) {
-	nameBytes, err := syscall.BytePtrFromString(name)
-	if err != nil {
-		return syscall.Stat_t{}, err
-	}
-	namePtr := unsafe.Pointer(nameBytes)
-
-	var stat syscall.Stat_t
-	statPtr := unsafe.Pointer(&stat)
-
-	if _, _, errno := syscall.Syscall6(
-		syscall.SYS_NEWFSTATAT,
-		uintptr(dirFd),
-		uintptr(namePtr),
-		uintptr(statPtr),
-		linux.AT_SYMLINK_NOFOLLOW,
-		0,
-		0); errno != 0 {
-
-		return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
-	}
-	return stat, nil
-}
-
 func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) error {
 	// utimensat(2) doesn't accept empty name, instead name must be nil to make it
 	// operate directly on 'dirFd' unlike other *at syscalls.
diff --git a/runsc/main.go b/runsc/main.go
index 0ff68160d..abf929511 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -26,6 +26,7 @@ import (
 	"path/filepath"
 	"strings"
 	"syscall"
+	"time"
 
 	"flag"
 
@@ -41,34 +42,39 @@ import (
 var (
 	// Although these flags are not part of the OCI spec, they are used by
 	// Docker, and thus should not be changed.
-	rootDir     = flag.String("root", "", "root directory for storage of container state")
-	logFilename = flag.String("log", "", "file path where internal debug information is written, default is stdout")
-	logFormat   = flag.String("log-format", "text", "log format: text (default), json, or json-k8s")
-	debug       = flag.Bool("debug", false, "enable debug logging")
-	showVersion = flag.Bool("version", false, "show version and exit")
+	rootDir     = flag.String("root", "", "root directory for storage of container state.")
+	logFilename = flag.String("log", "", "file path where internal debug information is written, default is stdout.")
+	logFormat   = flag.String("log-format", "text", "log format: text (default), json, or json-k8s.")
+	debug       = flag.Bool("debug", false, "enable debug logging.")
+	showVersion = flag.Bool("version", false, "show version and exit.")
+	// TODO(gvisor.dev/issue/193): support systemd cgroups
+	systemdCgroup = flag.Bool("systemd-cgroup", false, "Use systemd for cgroups. NOT SUPPORTED.")
 
 	// These flags are unique to runsc, and are used to configure parts of the
 	// system that are not covered by the runtime spec.
 
 	// Debugging flags.
 	debugLog        = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
-	logPackets      = flag.Bool("log-packets", false, "enable network packet logging")
+	logPackets      = flag.Bool("log-packets", false, "enable network packet logging.")
 	logFD           = flag.Int("log-fd", -1, "file descriptor to log to.  If set, the 'log' flag is ignored.")
 	debugLogFD      = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to.  If set, the 'debug-log-dir' flag is ignored.")
-	debugLogFormat  = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s")
-	alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr")
+	debugLogFormat  = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.")
+	alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr.")
 
 	// Debugging flags: strace related
-	strace         = flag.Bool("strace", false, "enable strace")
+	strace         = flag.Bool("strace", false, "enable strace.")
 	straceSyscalls = flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.")
-	straceLogSize  = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs")
+	straceLogSize  = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs.")
 
 	// Flags that control sandbox runtime behavior.
-	platformName       = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm")
+	platformName       = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.")
 	network            = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
-	gso                = flag.Bool("gso", true, "enable generic segmenation offload")
+	hardwareGSO        = flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.")
+	softwareGSO        = flag.Bool("software-gso", true, "enable software segmentation offload when hardware offload can't be enabled.")
 	fileAccess         = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
+	fsGoferHostUDS     = flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.")
 	overlay            = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
+	overlayfsStaleRead = flag.Bool("overlayfs-stale-read", false, "reopen cached FDs after a file is opened for write to workaround overlayfs limitation on kernels before 4.19.")
 	watchdogAction     = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
 	panicSignal        = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
 	profile            = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
@@ -76,9 +82,11 @@ var (
 	numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.")
 	rootless           = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
 	referenceLeakMode  = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.")
+	cpuNumFromQuota    = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)")
 
 	// Test flags, not to be used outside tests, ever.
 	testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
+	testOnlyTestNameEnv                        = flag.String("TESTONLY-test-name-env", "", "TEST ONLY; do not ever use! Used for automated tests to improve logging.")
 )
 
 func main() {
@@ -132,6 +140,12 @@ func main() {
 		os.Exit(0)
 	}
 
+	// TODO(gvisor.dev/issue/193): support systemd cgroups
+	if *systemdCgroup {
+		fmt.Fprintln(os.Stderr, "systemd cgroup flag passed, but systemd cgroups not supported. See gvisor.dev/issue/193")
+		os.Exit(1)
+	}
+
 	var errorLogger io.Writer
 	if *logFD > -1 {
 		errorLogger = os.NewFile(uintptr(*logFD), "error log file")
@@ -194,9 +208,11 @@ func main() {
 		DebugLog:           *debugLog,
 		DebugLogFormat:     *debugLogFormat,
 		FileAccess:         fsAccess,
+		FSGoferHostUDS:     *fsGoferHostUDS,
 		Overlay:            *overlay,
 		Network:            netType,
-		GSO:                *gso,
+		HardwareGSO:        *hardwareGSO,
+		SoftwareGSO:        *softwareGSO,
 		LogPackets:         *logPackets,
 		Platform:           platformType,
 		Strace:             *strace,
@@ -209,8 +225,11 @@ func main() {
 		Rootless:           *rootless,
 		AlsoLogToStderr:    *alsoLogToStderr,
 		ReferenceLeakMode:  refsLeakMode,
+		OverlayfsStaleRead: *overlayfsStaleRead,
+		CPUNumFromQuota:    *cpuNumFromQuota,
 
 		TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
+		TestOnlyTestNameEnv:                        *testOnlyTestNameEnv,
 	}
 	if len(*straceSyscalls) != 0 {
 		conf.StraceSyscalls = strings.Split(*straceSyscalls, ",")
@@ -221,6 +240,18 @@ func main() {
 		log.SetLevel(log.Debug)
 	}
 
+	// Logging will include the local date and time via the time package.
+	//
+	// On first use, time.Local initializes the local time zone, which
+	// involves opening tzdata files on the host. Since this requires
+	// opening host files, it must be done before syscall filter
+	// installation.
+	//
+	// Generally there will be a log message before filter installation
+	// that will force initialization, but force initialization here in
+	// case that does not occur.
+	_ = time.Local.String()
+
 	subcommand := flag.CommandLine.Arg(0)
 
 	var e log.Emitter
@@ -237,14 +268,14 @@ func main() {
 		// want with them. Since Docker and Containerd both eat boot's stderr, we
 		// dup our stderr to the provided log FD so that panics will appear in the
 		// logs, rather than just disappear.
-		if err := syscall.Dup2(int(f.Fd()), int(os.Stderr.Fd())); err != nil {
+		if err := syscall.Dup3(int(f.Fd()), int(os.Stderr.Fd()), 0); err != nil {
 			cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err)
 		}
 
 		e = newEmitter(*debugLogFormat, f)
 
 	} else if *debugLog != "" {
-		f, err := specutils.DebugLogFile(*debugLog, subcommand)
+		f, err := specutils.DebugLogFile(*debugLog, subcommand, "" /* name */)
 		if err != nil {
 			cmd.Fatalf("error opening debug log file in %q: %v", *debugLog, err)
 		}
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
index 7fdceaab6..8001949d5 100644
--- a/runsc/sandbox/BUILD
+++ b/runsc/sandbox/BUILD
@@ -19,6 +19,8 @@ go_library(
         "//pkg/log",
         "//pkg/sentry/control",
         "//pkg/sentry/platform",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/stack",
         "//pkg/urpc",
         "//runsc/boot",
         "//runsc/boot/platforms",
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 5634f0707..be8b72b3e 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -28,6 +28,8 @@ import (
 	"github.com/vishvananda/netlink"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/specutils"
@@ -61,7 +63,7 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi
 		// Build the path to the net namespace of the sandbox process.
 		// This is what we will copy.
 		nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
-		if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO, conf.NumNetworkChannels); err != nil {
+		if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.NumNetworkChannels); err != nil {
 			return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err)
 		}
 	case boot.NetworkHost:
@@ -136,7 +138,7 @@ func isRootNS() (bool, error) {
 // createInterfacesAndRoutesFromNS scrapes the interface and routes from the
 // net namespace with the given path, creates them in the sandbox, and removes
 // them from the host.
-func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool, numNetworkChannels int) error {
+func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, numNetworkChannels int) error {
 	// Join the network namespace that we will be copying.
 	restore, err := joinNetNS(nsPath)
 	if err != nil {
@@ -182,36 +184,39 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
 			continue
 		}
 
-		// Keep only IPv4 addresses.
-		var ip4addrs []*net.IPNet
+		var ipAddrs []*net.IPNet
 		for _, ifaddr := range allAddrs {
 			ipNet, ok := ifaddr.(*net.IPNet)
 			if !ok {
 				return fmt.Errorf("address is not IPNet: %+v", ifaddr)
 			}
-			if ipNet.IP.To4() == nil {
-				log.Warningf("IPv6 is not supported, skipping: %v", ipNet)
-				continue
-			}
-			ip4addrs = append(ip4addrs, ipNet)
+			ipAddrs = append(ipAddrs, ipNet)
 		}
-		if len(ip4addrs) == 0 {
-			log.Warningf("No IPv4 address found for interface %q, skipping", iface.Name)
+		if len(ipAddrs) == 0 {
+			log.Warningf("No usable IP addresses found for interface %q, skipping", iface.Name)
 			continue
 		}
 
 		// Scrape the routes before removing the address, since that
 		// will remove the routes as well.
-		routes, def, err := routesForIface(iface)
+		routes, defv4, defv6, err := routesForIface(iface)
 		if err != nil {
 			return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err)
 		}
-		if def != nil {
-			if !args.DefaultGateway.Route.Empty() {
-				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, def, args.DefaultGateway)
+		if defv4 != nil {
+			if !args.Defaultv4Gateway.Route.Empty() {
+				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway)
 			}
-			args.DefaultGateway.Route = *def
-			args.DefaultGateway.Name = iface.Name
+			args.Defaultv4Gateway.Route = *defv4
+			args.Defaultv4Gateway.Name = iface.Name
+		}
+
+		if defv6 != nil {
+			if !args.Defaultv6Gateway.Route.Empty() {
+				return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway)
+			}
+			args.Defaultv6Gateway.Route = *defv6
+			args.Defaultv6Gateway.Name = iface.Name
 		}
 
 		link := boot.FDBasedLink{
@@ -232,7 +237,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
 		// Create the socket for the device.
 		for i := 0; i < link.NumChannels; i++ {
 			log.Debugf("Creating Channel %d", i)
-			socketEntry, err := createSocket(iface, ifaceLink, enableGSO)
+			socketEntry, err := createSocket(iface, ifaceLink, hardwareGSO)
 			if err != nil {
 				return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err)
 			}
@@ -247,9 +252,15 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
 			args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile)
 		}
 
+		if link.GSOMaxSize == 0 && softwareGSO {
+			// Hardware GSO is disabled. Let's enable software GSO.
+			link.GSOMaxSize = stack.SoftwareGSOMaxSize
+			link.SoftwareGSOEnabled = true
+		}
+
 		// Collect the addresses for the interface, enable forwarding,
 		// and remove them from the host.
-		for _, addr := range ip4addrs {
+		for _, addr := range ipAddrs {
 			link.Addresses = append(link.Addresses, addr.IP)
 
 			// Steal IP address from NIC.
@@ -345,46 +356,56 @@ func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink,
 }
 
 // routesForIface iterates over all routes for the given interface and converts
-// them to boot.Routes.
-func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) {
+// them to boot.Routes. It also returns the default v4 and v6 routes, if found.
+func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, *boot.Route, error) {
 	link, err := netlink.LinkByIndex(iface.Index)
 	if err != nil {
-		return nil, nil, err
+		return nil, nil, nil, err
 	}
 	rs, err := netlink.RouteList(link, netlink.FAMILY_ALL)
 	if err != nil {
-		return nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err)
+		return nil, nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err)
 	}
 
-	var def *boot.Route
+	var defv4, defv6 *boot.Route
 	var routes []boot.Route
 	for _, r := range rs {
 		// Is it a default route?
 		if r.Dst == nil {
 			if r.Gw == nil {
-				return nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r)
-			}
-			if r.Gw.To4() == nil {
-				log.Warningf("IPv6 is not supported, skipping default route: %v", r)
-				continue
-			}
-			if def != nil {
-				return nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, def, r)
+				return nil, nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r)
 			}
 			// Create a catch all route to the gateway.
-			def = &boot.Route{
-				Destination: net.IPNet{
-					IP:   net.IPv4zero,
-					Mask: net.IPMask(net.IPv4zero),
-				},
-				Gateway: r.Gw,
+			switch len(r.Gw) {
+			case header.IPv4AddressSize:
+				if defv4 != nil {
+					return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv4, r)
+				}
+				defv4 = &boot.Route{
+					Destination: net.IPNet{
+						IP:   net.IPv4zero,
+						Mask: net.IPMask(net.IPv4zero),
+					},
+					Gateway: r.Gw,
+				}
+			case header.IPv6AddressSize:
+				if defv6 != nil {
+					return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv6, r)
+				}
+
+				defv6 = &boot.Route{
+					Destination: net.IPNet{
+						IP:   net.IPv6zero,
+						Mask: net.IPMask(net.IPv6zero),
+					},
+					Gateway: r.Gw,
+				}
+			default:
+				return nil, nil, nil, fmt.Errorf("unexpected address size for gateway: %+v for route: %+v", r.Gw, r)
 			}
 			continue
 		}
-		if r.Dst.IP.To4() == nil {
-			log.Warningf("IPv6 is not supported, skipping route: %v", r)
-			continue
-		}
+
 		dst := *r.Dst
 		dst.IP = dst.IP.Mask(dst.Mask)
 		routes = append(routes, boot.Route{
@@ -392,7 +413,7 @@ func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) {
 			Gateway:     r.Gw,
 		})
 	}
-	return routes, def, nil
+	return routes, defv4, defv6, nil
 }
 
 // removeAddress removes IP address from network device. It's equivalent to:
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index df3c0c5ef..ce1452b87 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -18,6 +18,7 @@ package sandbox
 import (
 	"context"
 	"fmt"
+	"math"
 	"os"
 	"os/exec"
 	"strconv"
@@ -351,7 +352,15 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 		nextFD++
 	}
 	if conf.DebugLog != "" {
-		debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "boot")
+		test := ""
+		if len(conf.TestOnlyTestNameEnv) != 0 {
+			// Fetch test name if one is provided and the test only flag was set.
+			if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
+				test = t
+			}
+		}
+
+		debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "boot", test)
 		if err != nil {
 			return fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err)
 		}
@@ -623,6 +632,26 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
 		if err != nil {
 			return fmt.Errorf("getting cpu count from cgroups: %v", err)
 		}
+		if conf.CPUNumFromQuota {
+			// Dropping below 2 CPUs can cause applications to disable
+			// locking, which can lead to hard-to-debug errors, so keep
+			// at least two cores as a reasonable minimum.
+			const minCPUs = 2
+
+			quota, err := s.Cgroup.CPUQuota()
+			if err != nil {
+				return fmt.Errorf("getting cpu qouta from cgroups: %v", err)
+			}
+			if n := int(math.Ceil(quota)); n > 0 {
+				if n < minCPUs {
+					n = minCPUs
+				}
+				if n < cpuNum {
+					// Only lower the cpu number.
+					cpuNum = n
+				}
+			}
+		}
 		cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum))
 
 		mem, err := s.Cgroup.MemoryLimit()
@@ -996,16 +1025,22 @@ func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error {
 // DestroyContainer destroys the given container. If it is the root container,
 // then the entire sandbox is destroyed.
 func (s *Sandbox) DestroyContainer(cid string) error {
+	if err := s.destroyContainer(cid); err != nil {
+		// If the sandbox isn't running, the container has already been destroyed;
+		// ignore the error in this case.
+		if s.IsRunning() {
+			return err
+		}
+	}
+	return nil
+}
+
+func (s *Sandbox) destroyContainer(cid string) error {
 	if s.IsRootContainer(cid) {
 		log.Debugf("Destroying root container %q by destroying sandbox", cid)
 		return s.destroy()
 	}
 
-	if !s.IsRunning() {
-		// Sandbox isn't running anymore, container is already destroyed.
-		return nil
-	}
-
 	log.Debugf("Destroying container %q in sandbox %q", cid, s.ID)
 	conn, err := s.sandboxConnect()
 	if err != nil {
diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD
index fbfb8e2f8..205638803 100644
--- a/runsc/specutils/BUILD
+++ b/runsc/specutils/BUILD
@@ -5,6 +5,7 @@ package(licenses = ["notice"])
 go_library(
     name = "specutils",
     srcs = [
+        "cri.go",
         "fs.go",
         "namespace.go",
         "specutils.go",
@@ -13,6 +14,7 @@ go_library(
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/bits",
         "//pkg/log",
         "//pkg/sentry/kernel/auth",
         "@com_github_cenkalti_backoff//:go_default_library",
diff --git a/runsc/specutils/cri.go b/runsc/specutils/cri.go
new file mode 100644
index 000000000..9c5877cd5
--- /dev/null
+++ b/runsc/specutils/cri.go
@@ -0,0 +1,110 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package specutils
+
+import (
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+const (
+	// ContainerdContainerTypeAnnotation is the OCI annotation set by
+	// containerd to indicate whether the container to create should have
+	// its own sandbox or a container within an existing sandbox.
+	ContainerdContainerTypeAnnotation = "io.kubernetes.cri.container-type"
+	// ContainerdContainerTypeContainer is the container type value
+	// indicating the container should be created in an existing sandbox.
+	ContainerdContainerTypeContainer = "container"
+	// ContainerdContainerTypeSandbox is the container type value
+	// indicating the container should be created in a new sandbox.
+	ContainerdContainerTypeSandbox = "sandbox"
+
+	// ContainerdSandboxIDAnnotation is the OCI annotation set to indicate
+	// which sandbox the container should be created in when the container
+	// is not the first container in the sandbox.
+	ContainerdSandboxIDAnnotation = "io.kubernetes.cri.sandbox-id"
+
+	// CRIOContainerTypeAnnotation is the OCI annotation set by
+	// CRI-O to indicate whether the container to create should have
+	// its own sandbox or a container within an existing sandbox.
+	CRIOContainerTypeAnnotation = "io.kubernetes.cri-o.ContainerType"
+
+	// CRIOContainerTypeContainer is the container type value
+	// indicating the container should be created in an existing sandbox.
+	CRIOContainerTypeContainer = "container"
+	// CRIOContainerTypeSandbox is the container type value
+	// indicating the container should be created in a new sandbox.
+	CRIOContainerTypeSandbox = "sandbox"
+
+	// CRIOSandboxIDAnnotation is the OCI annotation set to indicate
+	// which sandbox the container should be created in when the container
+	// is not the first container in the sandbox.
+	CRIOSandboxIDAnnotation = "io.kubernetes.cri-o.SandboxID"
+)
+
+// ContainerType represents the type of container requested by the calling container manager.
+type ContainerType int
+
+const (
+	// ContainerTypeUnspecified indicates that no known container type
+	// annotation was found in the spec.
+	ContainerTypeUnspecified ContainerType = iota
+	// ContainerTypeUnknown indicates that a container type was specified
+	// but is unknown to us.
+	ContainerTypeUnknown
+	// ContainerTypeSandbox indicates that the container should be run in a
+	// new sandbox.
+	ContainerTypeSandbox
+	// ContainerTypeContainer indicates that the container should be run in
+	// an existing sandbox.
+	ContainerTypeContainer
+)
+
+// SpecContainerType tries to determine the type of container specified by the
+// container manager using well-known container annotations.
+func SpecContainerType(spec *specs.Spec) ContainerType {
+	if t, ok := spec.Annotations[ContainerdContainerTypeAnnotation]; ok {
+		switch t {
+		case ContainerdContainerTypeSandbox:
+			return ContainerTypeSandbox
+		case ContainerdContainerTypeContainer:
+			return ContainerTypeContainer
+		default:
+			return ContainerTypeUnknown
+		}
+	}
+	if t, ok := spec.Annotations[CRIOContainerTypeAnnotation]; ok {
+		switch t {
+		case CRIOContainerTypeSandbox:
+			return ContainerTypeSandbox
+		case CRIOContainerTypeContainer:
+			return ContainerTypeContainer
+		default:
+			return ContainerTypeUnknown
+		}
+	}
+	return ContainerTypeUnspecified
+}
+
+// SandboxID returns the ID of the sandbox to join and whether an ID was found
+// in the spec.
+func SandboxID(spec *specs.Spec) (string, bool) {
+	if id, ok := spec.Annotations[ContainerdSandboxIDAnnotation]; ok {
+		return id, true
+	}
+	if id, ok := spec.Annotations[CRIOSandboxIDAnnotation]; ok {
+		return id, true
+	}
+	return "", false
+}
diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go
index d441419cb..c7dd3051c 100644
--- a/runsc/specutils/namespace.go
+++ b/runsc/specutils/namespace.go
@@ -33,19 +33,19 @@ import (
 func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr {
 	switch nst {
 	case specs.IPCNamespace:
-		return syscall.CLONE_NEWIPC
+		return unix.CLONE_NEWIPC
 	case specs.MountNamespace:
-		return syscall.CLONE_NEWNS
+		return unix.CLONE_NEWNS
 	case specs.NetworkNamespace:
-		return syscall.CLONE_NEWNET
+		return unix.CLONE_NEWNET
 	case specs.PIDNamespace:
-		return syscall.CLONE_NEWPID
+		return unix.CLONE_NEWPID
 	case specs.UTSNamespace:
-		return syscall.CLONE_NEWUTS
+		return unix.CLONE_NEWUTS
 	case specs.UserNamespace:
-		return syscall.CLONE_NEWUSER
+		return unix.CLONE_NEWUSER
 	case specs.CgroupNamespace:
-		panic("cgroup namespace has no associated clone flag")
+		return unix.CLONE_NEWCGROUP
 	default:
 		panic(fmt.Sprintf("unknown namespace %v", nst))
 	}
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index 2eec92349..d3c2e4e78 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -23,6 +23,7 @@ import (
 	"os"
 	"path"
 	"path/filepath"
+	"strconv"
 	"strings"
 	"syscall"
 	"time"
@@ -30,6 +31,7 @@ import (
 	"github.com/cenkalti/backoff"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/bits"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 )
@@ -90,7 +92,7 @@ func ValidateSpec(spec *specs.Spec) error {
 		log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile)
 	}
 
-	// TODO(b/72226747): Apply seccomp to application inside sandbox.
+	// TODO(gvisor.dev/issue/510): Apply seccomp to application inside sandbox.
 	if spec.Linux != nil && spec.Linux.Seccomp != nil {
 		log.Warningf("Seccomp spec is being ignored")
 	}
@@ -106,23 +108,18 @@ func ValidateSpec(spec *specs.Spec) error {
 		}
 	}
 
-	// Two annotations are use by containerd to support multi-container pods.
-	//   "io.kubernetes.cri.container-type"
-	//   "io.kubernetes.cri.sandbox-id"
-	containerType, hasContainerType := spec.Annotations[ContainerdContainerTypeAnnotation]
-	_, hasSandboxID := spec.Annotations[ContainerdSandboxIDAnnotation]
-	switch {
-	// Non-containerd use won't set a container type.
-	case !hasContainerType:
-	case containerType == ContainerdContainerTypeSandbox:
-	// When starting a container in an existing sandbox, the sandbox ID
-	// must be set.
-	case containerType == ContainerdContainerTypeContainer:
-		if !hasSandboxID {
-			return fmt.Errorf("spec has container-type of %s, but no sandbox ID set", containerType)
+	// CRI specifies whether a container should start a new sandbox, or run
+	// another container in an existing sandbox.
+	switch SpecContainerType(spec) {
+	case ContainerTypeContainer:
+		// When starting a container in an existing sandbox, the
+		// sandbox ID must be set.
+		if _, ok := SandboxID(spec); !ok {
+			return fmt.Errorf("spec has container-type of container, but no sandbox ID set")
 		}
+	case ContainerTypeUnknown:
+		return fmt.Errorf("unknown container-type")
 	default:
-		return fmt.Errorf("unknown container-type: %s", containerType)
 	}
 
 	return nil
@@ -240,6 +237,15 @@ func AllCapabilities() *specs.LinuxCapabilities {
 	}
 }
 
+// AllCapabilitiesUint64 returns a bitmask containing all capabilities set.
+func AllCapabilitiesUint64() uint64 {
+	var rv uint64
+	for _, cap := range capFromName {
+		rv |= bits.MaskOf64(int(cap))
+	}
+	return rv
+}
+
 var capFromName = map[string]linux.Capability{
 	"CAP_CHOWN":            linux.CAP_CHOWN,
 	"CAP_DAC_OVERRIDE":     linux.CAP_DAC_OVERRIDE,
@@ -327,39 +333,6 @@ func IsSupportedDevMount(m specs.Mount) bool {
 	return true
 }
 
-const (
-	// ContainerdContainerTypeAnnotation is the OCI annotation set by
-	// containerd to indicate whether the container to create should have
-	// its own sandbox or a container within an existing sandbox.
-	ContainerdContainerTypeAnnotation = "io.kubernetes.cri.container-type"
-	// ContainerdContainerTypeContainer is the container type value
-	// indicating the container should be created in an existing sandbox.
-	ContainerdContainerTypeContainer = "container"
-	// ContainerdContainerTypeSandbox is the container type value
-	// indicating the container should be created in a new sandbox.
-	ContainerdContainerTypeSandbox = "sandbox"
-
-	// ContainerdSandboxIDAnnotation is the OCI annotation set to indicate
-	// which sandbox the container should be created in when the container
-	// is not the first container in the sandbox.
-	ContainerdSandboxIDAnnotation = "io.kubernetes.cri.sandbox-id"
-)
-
-// ShouldCreateSandbox returns true if the spec indicates that a new sandbox
-// should be created for the container. If false, the container should be
-// started in an existing sandbox.
-func ShouldCreateSandbox(spec *specs.Spec) bool {
-	t, ok := spec.Annotations[ContainerdContainerTypeAnnotation]
-	return !ok || t == ContainerdContainerTypeSandbox
-}
-
-// SandboxID returns the ID of the sandbox to join and whether an ID was found
-// in the spec.
-func SandboxID(spec *specs.Spec) (string, bool) {
-	id, ok := spec.Annotations[ContainerdSandboxIDAnnotation]
-	return id, ok
-}
-
 // WaitForReady waits for a process to become ready. The process is ready when
 // the 'ready' function returns true. It continues to wait if 'ready' returns
 // false. It returns error on timeout, if the process stops or if 'ready' fails.
@@ -398,13 +371,15 @@ func WaitForReady(pid int, timeout time.Duration, ready func() (bool, error)) er
 //   - %TIMESTAMP%: is replaced with a timestamp using the following format:
 //			<yyyymmdd-hhmmss.uuuuuu>
 //	 - %COMMAND%: is replaced with 'command'
-func DebugLogFile(logPattern, command string) (*os.File, error) {
+//	 - %TEST%: is replaced with 'test' (omitted by default)
+func DebugLogFile(logPattern, command, test string) (*os.File, error) {
 	if strings.HasSuffix(logPattern, "/") {
 		// Default format: <debug-log>/runsc.log.<yyyymmdd-hhmmss.uuuuuu>.<command>
 		logPattern += "runsc.log.%TIMESTAMP%.%COMMAND%"
 	}
 	logPattern = strings.Replace(logPattern, "%TIMESTAMP%", time.Now().Format("20060102-150405.000000"), -1)
 	logPattern = strings.Replace(logPattern, "%COMMAND%", command, -1)
+	logPattern = strings.Replace(logPattern, "%TEST%", test, -1)
 
 	dir := filepath.Dir(logPattern)
 	if err := os.MkdirAll(dir, 0775); err != nil {
@@ -503,3 +478,53 @@ func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) {
 		}
 	}
 }
+
+// GetOOMScoreAdj reads the oom_score_adj value of the given process.
+func GetOOMScoreAdj(pid int) (int, error) {
+	data, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid))
+	if err != nil {
+		return 0, err
+	}
+	return strconv.Atoi(strings.TrimSpace(string(data)))
+}
+
+// GetParentPid gets the parent process ID of the specified PID.
+func GetParentPid(pid int) (int, error) {
+	data, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/stat", pid))
+	if err != nil {
+		return 0, err
+	}
+
+	var cpid string
+	var name string
+	var state string
+	var ppid int
+	// Parse the leading fields (pid, comm, state, ppid); assumes comm has no spaces.
+	_, err = fmt.Sscanf(string(data),
+		"%v %v %v %d",
+		// cpid is ignored.
+		&cpid,
+		// name is ignored.
+		&name,
+		// state is ignored.
+		&state,
+		&ppid)
+
+	if err != nil {
+		return 0, err
+	}
+
+	return ppid, nil
+}
+
+// EnvVar looks for a variable's value in the env slice, assuming entries have
+// the format "NAME=VALUE".
+func EnvVar(env []string, name string) (string, bool) {
+	prefix := name + "="
+	for _, e := range env {
+		if strings.HasPrefix(e, prefix) {
+			return strings.TrimPrefix(e, prefix), true
+		}
+	}
+	return "", false
+}
diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD
index d44ebc906..c96ca2eb6 100644
--- a/runsc/testutil/BUILD
+++ b/runsc/testutil/BUILD
@@ -9,6 +9,7 @@ go_library(
     importpath = "gvisor.dev/gvisor/runsc/testutil",
     visibility = ["//:sandbox"],
     deps = [
+        "//pkg/log",
         "//runsc/boot",
         "//runsc/specutils",
         "@com_github_cenkalti_backoff//:go_default_library",
diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go
index 57ab73d97..9632776d2 100644
--- a/runsc/testutil/testutil.go
+++ b/runsc/testutil/testutil.go
@@ -25,13 +25,14 @@ import (
 	"fmt"
 	"io"
 	"io/ioutil"
-	"log"
+	"math"
 	"math/rand"
 	"net/http"
 	"os"
 	"os/exec"
 	"os/signal"
 	"path/filepath"
+	"strconv"
 	"strings"
 	"sync"
 	"sync/atomic"
@@ -40,6 +41,7 @@ import (
 
 	"github.com/cenkalti/backoff"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
@@ -149,13 +151,6 @@ func TestConfig() *boot.Config {
 	}
 }
 
-// TestConfigWithRoot returns the default configuration to use in tests.
-func TestConfigWithRoot(rootDir string) *boot.Config {
-	conf := TestConfig()
-	conf.RootDir = rootDir
-	return conf
-}
-
 // NewSpecWithArgs creates a simple spec with the given args suitable for use
 // in tests.
 func NewSpecWithArgs(args ...string) *specs.Spec {
@@ -284,7 +279,7 @@ func WaitForHTTP(port int, timeout time.Duration) error {
 		url := fmt.Sprintf("http://localhost:%d/", port)
 		resp, err := c.Get(url)
 		if err != nil {
-			log.Printf("Waiting %s: %v", url, err)
+			log.Infof("Waiting %s: %v", url, err)
 			return err
 		}
 		resp.Body.Close()
@@ -438,3 +433,44 @@ func IsStatic(filename string) (bool, error) {
 	}
 	return true, nil
 }
+
+// TestBoundsForShard calculates the beginning and end indices for the test
+// based on the TEST_SHARD_INDEX and TEST_TOTAL_SHARDS environment vars. The
+// returned ints are the beginning (inclusive) and end (exclusive) of the
+// subslice corresponding to the shard. If either of the env vars is not
+// present, then the function will return bounds that include all tests. If
+// there are more shards than there are tests, then the returned range may be
+// empty.
+func TestBoundsForShard(numTests int) (int, int, error) {
+	var (
+		begin = 0
+		end   = numTests
+	)
+	indexStr, totalStr := os.Getenv("TEST_SHARD_INDEX"), os.Getenv("TEST_TOTAL_SHARDS")
+	if indexStr == "" || totalStr == "" {
+		return begin, end, nil
+	}
+
+	// Parse index and total to ints.
+	shardIndex, err := strconv.Atoi(indexStr)
+	if err != nil {
+		return 0, 0, fmt.Errorf("invalid TEST_SHARD_INDEX %q: %v", indexStr, err)
+	}
+	shardTotal, err := strconv.Atoi(totalStr)
+	if err != nil {
+		return 0, 0, fmt.Errorf("invalid TEST_TOTAL_SHARDS %q: %v", totalStr, err)
+	}
+
+	// Split the tests into contiguous shards of ceil(numTests/shardTotal) each.
+	shardSize := int(math.Ceil(float64(numTests) / float64(shardTotal)))
+	begin = shardIndex * shardSize
+	end = ((shardIndex + 1) * shardSize)
+	if begin > numTests {
+		// Nothing to run.
+		return 0, 0, nil
+	}
+	if end > numTests {
+		end = numTests
+	}
+	return begin, end, nil
+}
diff --git a/runsc/version.go b/runsc/version.go
index ce0573a9b..ab9194b9d 100644
--- a/runsc/version.go
+++ b/runsc/version.go
@@ -15,4 +15,4 @@
 package main
 
 // version is set during linking.
-var version = ""
+var version = "VERSION_MISSING"
diff --git a/runsc/version_test.sh b/runsc/version_test.sh
new file mode 100755
index 000000000..cc0ca3f05
--- /dev/null
+++ b/runsc/version_test.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euf -x -o pipefail
+
+readonly runsc="${TEST_SRCDIR}/__main__/runsc/linux_amd64_pure_stripped/runsc"
+readonly version=$($runsc --version)
+
+# Version should not match VERSION, which is the default and which will
+# also appear if something is wrong with the workspace_status.sh script.
+if [[ $version =~ "VERSION" ]]; then
+  echo "FAIL: Got bad version $version"
+  exit 1
+fi
+
+# Version should contain at least one number.
+if [[ ! $version =~ [0-9] ]]; then
+  echo "FAIL: Got bad version $version"
+  exit 1
+fi
+
+echo "PASS: Got OK version $version"
+exit 0